Library Loading for the analysis

options(connectionObserver = NULL)

library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(org.Mm.eg.db)
library(AnnotationDbi)
library(data.table)
library(tidyverse)
library(rtracklayer)
library(Rsamtools)
library(DESeq2)
library(Rsubread)
library(VennDiagram)
library(CLIPanalyze)
library(RColorBrewer)
library(gplots)
library(Biostrings)
library(pheatmap)
library(ggrepel)
library(rvest)
library(ggseqlogo)
library(gridExtra)

Analysis for the HF sample CLIP data

Load the rds file from HF_1/2/3 3 sample CLIP analysis and use MA plot to visuaize that sequence depth are balanced between CLIP and IC libraries, and the peaks that were called.

dir.create("PDF_figure", showWarnings = FALSE)

peak.data.hf<- readRDS("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/peakdata.HF.rds")

plotMA(peak.data.hf$gene.counts.nopeaks,
       main = "MA plot for HF HEAP vs IC\n in genes outside peaks")

pdf("PDF_figure/MAPlot_HF_HEAPvIC_OutsidePeaks.pdf",
    height = 4,
    width = 5)
plotMA(peak.data.hf$gene.counts.nopeaks,
       main = "MA plot for HF HEAP vs IC\n in genes outside peaks")
dev.off()
## quartz_off_screen 
##                 2
plotMA(peak.data.hf$res.deseq,
       main = "MA plot for HF HEAP vs IC in peaks,\none-sided test")

pdf("PDF_figure/MAPlot_HF_HEAPvIC_InPeaks.pdf",
    height = 4,
    width = 5)
plotMA(peak.data.hf$res.deseq,
       main = "MA plot for HF HEAP vs IC in peaks,\none-sided test")
dev.off()
## quartz_off_screen 
##                 2

Prepare the peak, assign scores to HF peaks by adjusted p-value.

peaks.all.hf <- peak.data.hf$peaks
peaks.all.hf <- subset(peaks.all.hf, width > 20)
peaks.all.hf <- subset(peaks.all.hf, log2FC > 0)
peaks.all.hf <- keepStandardChromosomes(peaks.all.hf)
score(peaks.all.hf) <- -log10(peaks.all.hf$padj)
length(peaks.all.hf)
## [1] 1529
summary(width(peaks.all.hf))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    48.0    50.0    51.9    54.0   105.0
count.threshold <- 10
norm.counts <- counts(peak.data.hf$peak.counts, normalized = TRUE)
norm.counts <- norm.counts[names(peaks.all.hf), ]
colnames(norm.counts)[1:6] <- c(paste0("HF", 1:3), paste0("HF-IC", 1:3))
selected.peaks <- rowMeans(norm.counts[, paste0("HF", 1:3)]) > count.threshold 
peaks.all.hf <- peaks.all.hf[selected.peaks, ]

length(peaks.all.hf)
## [1] 1412
summary(width(peaks.all.hf))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   48.00   50.00   51.83   53.00  105.00

Count fraction of peaks in genomic features for top N peaks selected by p-value of enrichment in WT HEAP-CLIP vs. input:

countFeatures <- function(peaks, top.n, ignore.intergenic = FALSE) {
    peak.subset <- peaks
    if (ignore.intergenic) {
        peak.subset <- subset(peak.subset, annot != "intergenic")
    }
    peak.subset <- subset(peak.subset, rank(-peak.subset$score) <= top.n)
    table(peak.subset$annot) / length(peak.subset)
}
npeaks <- seq(1, 15) * 100
feature.frac.hf.peaks <-
    sapply(npeaks, function(x) countFeatures(peaks.all.hf, x))
colnames(feature.frac.hf.peaks) <- npeaks

for (feature in c("utr3", "utr5", "exon", "intron", "intergenic")) {
    plot(colnames(feature.frac.hf.peaks), feature.frac.hf.peaks[feature, ],
         type = "o",
         main = sprintf("fraction of HF peaks in %s", feature),
         xlab = "top N peaks by adjusted p-value",
         ylab = sprintf("fraction of peaks in %s", feature))
}

pdf("PDF_figure/Fraction_of_HF_peaks.pdf",
    height = 4,
    width = 5)
for (feature in c("utr3", "utr5", "exon", "intron", "intergenic")) {
    plot(colnames(feature.frac.hf.peaks), feature.frac.hf.peaks[feature, ],
         type = "o",
         main = sprintf("fraction of HF peaks in %s", feature),
         xlab = "top N peaks by adjusted p-value",
         ylab = sprintf("fraction of peaks in %s", feature))
}
dev.off()
## quartz_off_screen 
##                 2

Select peaks significantly enriched over input and are in 3'UTR for further analysis.

padj.threshold <- 5*1e-2
hf.peaks <- subset(peaks.all.hf, padj < padj.threshold)
score(hf.peaks) <- -log10(hf.peaks$padj)
hf.peaks$input.log2FC <- hf.peaks$log2FC
hf.peaks$input.padj <- hf.peaks$padj
hf.peaks$log2FC <- NULL
hf.peaks$padj <- NULL
saveRDS(hf.peaks, "peaks-HF-selected.rds")
length(hf.peaks)
## [1] 1283
summary(width(hf.peaks))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   48.00   50.00   51.57   53.00  105.00
peaks.hf.utr3 <- subset(hf.peaks, annot =="utr3")
length(peaks.hf.utr3)
## [1] 536

Analysis for the HFK sample CLIP data

Load the rds file from HFK sample CLIP analysis and use MA plot to visuaize that sequence depth are balanced between CLIP and IC libraries, and the peaks that were called.

peak.data.hfk<- readRDS("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/peakdata.HFK.rds")

plotMA(peak.data.hfk$gene.counts.nopeaks,
       main = "MA plot for HFK HEAP vs IC\n in genes outside peaks")

pdf("PDF_figure/MAPlot_HFK_HEAPvIC_OutsidePeaks.pdf",
    height = 4,
    width = 5)
plotMA(peak.data.hfk$gene.counts.nopeaks,
       main = "MA plot for HFK HEAP vs IC\n in genes outside peaks")
dev.off()
## quartz_off_screen 
##                 2
plotMA(peak.data.hfk$res.deseq,
       main = "MA plot for HFK HEAP vs IC in peaks,\none-sided test")

pdf("PDF_figure/MAPlot_HFK_HEAPvIC_InPeaks.pdf",
    height = 4,
    width = 5)
plotMA(peak.data.hfk$res.deseq,
       main = "MA plot for HFK HEAP vs IC in peaks,\none-sided test")
dev.off()
## quartz_off_screen 
##                 2

Prepare the peak, assign scores to KRas peaks by adjusted p-value.

peaks.all.hfk <- peak.data.hfk$peaks
peaks.all.hfk <- subset(peaks.all.hfk, width > 20)
peaks.all.hfk <- subset(peaks.all.hfk, log2FC > 0)
peaks.all.hfk <- keepStandardChromosomes(peaks.all.hfk)
score(peaks.all.hfk) <- -log10(peaks.all.hfk$padj)
length(peaks.all.hfk)
## [1] 11361
summary(width(peaks.all.hfk))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   52.00   57.00   60.47   65.00  164.00
norm.counts <- counts(peak.data.hfk$peak.counts, normalized = TRUE)
norm.counts <- norm.counts[names(peaks.all.hfk), ]
colnames(norm.counts)[1:6] <- c(paste0("HFK", 1:3), paste0("HFK-IC", 1:3))
selected.peaks <- rowMeans(norm.counts[, paste0("HFK", 1:3)]) > count.threshold 
peaks.all.hfk <- peaks.all.hfk[selected.peaks, ]

length(peaks.all.hfk)
## [1] 6084
summary(width(peaks.all.hfk))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    51.0    56.0    59.7    63.0   164.0

Count fraction of peaks in genomic features for top N peaks selected by p-value of enrichment in WT HEAP-CLIP vs. input:

countFeatures <- function(peaks, top.n, ignore.intergenic = FALSE) {
    peak.subset <- peaks
    if (ignore.intergenic) {
        peak.subset <- subset(peak.subset, annot != "intergenic")
    }
    peak.subset <- subset(peak.subset, rank(-peak.subset$score) <= top.n)
    table(peak.subset$annot) / length(peak.subset)
}
npeaks <- seq(1, 14) * 500
feature.frac.hfk.peaks <-
    sapply(npeaks, function(x) countFeatures(peaks.all.hfk, x))
colnames(feature.frac.hfk.peaks) <- npeaks

for (feature in c("utr3", "utr5", "exon", "intron", "intergenic")) {
    plot(colnames(feature.frac.hfk.peaks), feature.frac.hfk.peaks[feature, ],
         type = "o",
         main = sprintf("fraction of HFK peaks in %s", feature),
         xlab = "top N peaks by adjusted p-value",
         ylab = sprintf("fraction of peaks in %s", feature))
}

pdf("PDF_figure/Fraction_of_HFK_peaks.pdf",
    height = 4,
    width = 5)
for (feature in c("utr3", "utr5", "exon", "intron", "intergenic")) {
    plot(colnames(feature.frac.hfk.peaks), feature.frac.hfk.peaks[feature, ],
         type = "o",
         main = sprintf("fraction of HFK peaks in %s", feature),
         xlab = "top N peaks by adjusted p-value",
         ylab = sprintf("fraction of peaks in %s", feature))
}
dev.off()

Select peaks significantly enriched over input and are in 3'UTR for further analysis.

padj.threshold <- 5*1e-2
hfk.peaks <- subset(peaks.all.hfk, padj < padj.threshold)
score(hfk.peaks) <- -log10(hfk.peaks$padj)
hfk.peaks$input.log2FC <- hfk.peaks$log2FC
hfk.peaks$input.padj <- hfk.peaks$padj
hfk.peaks$log2FC <- NULL
hfk.peaks$padj <- NULL
saveRDS(hfk.peaks, "peaks-HFK-selected.rds")
length(hfk.peaks)
## [1] 5109
summary(width(hfk.peaks))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   51.00   56.00   58.92   62.00  164.00
peaks.hfk.utr3 <- subset(hfk.peaks, annot =="utr3")
length(peaks.hfk.utr3)
## [1] 2886

Cross analysis of HF and HFK CLIP results

Venndiagram for mapping all overlapping peaks that have a LFC > 0 and padj < 0.05 between the HF and HFK samples

peaks.overlap <- subsetByOverlaps(hf.peaks, hfk.peaks)
length(peaks.overlap)
## [1] 838
grid.newpage()
draw.pairwise.venn(length(hfk.peaks),
                   length(hf.peaks),
                   length(peaks.overlap),
                   catergory <- c("HEAP-CLIP Peaks \nHFK",
                                  "HEAP-CLIP Peaks \nHF"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.08, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")

## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], text[GRID.text.18], text[GRID.text.19])
pdf("PDF_figure/Venn_HF_HFK_peaks.pdf",
    width = 8,
    height = 6)
draw.pairwise.venn(length(hfk.peaks),
                   length(hf.peaks),
                   length(peaks.overlap),
                   catergory <- c("HEAP-CLIP Peaks \nHFK",
                                  "HEAP-CLIP Peaks \nHF"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.08, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")
## (polygon[GRID.polygon.20], polygon[GRID.polygon.21], polygon[GRID.polygon.22], polygon[GRID.polygon.23], text[GRID.text.24], text[GRID.text.25], text[GRID.text.26], text[GRID.text.27], text[GRID.text.28])
dev.off()
## quartz_off_screen 
##                 2

Venndiagram for mapping overlapping peaks in 3' UTR that have a LFC > 0, padj < 0.05, and are in 3' UTR between the HF and HFK samples

peaks.utr3.overlap <- subsetByOverlaps(peaks.hf.utr3, peaks.hfk.utr3)
length(peaks.utr3.overlap)
## [1] 439
grid.newpage()
draw.pairwise.venn(length(peaks.hfk.utr3),
                  length(peaks.hf.utr3),
                  length(peaks.utr3.overlap),
                  catergory <- c("HEAP-CLIP Peaks \nHFK \nin 3' UTR",
                                 "HEAP-CLIP Peaks \nHF \nin 3' UTR"),
                  lty = "blank",
                  ex.text = FALSE,
                  fill = c("pink", "lightblue"),
                  cat.pos = c(250, 135), cat.dist = 0.1, margin = 0.1,
                  fontfamily = "sans", cat.fontfamily = "sans")

## (polygon[GRID.polygon.29], polygon[GRID.polygon.30], polygon[GRID.polygon.31], polygon[GRID.polygon.32], text[GRID.text.33], text[GRID.text.34], lines[GRID.lines.35], text[GRID.text.36], text[GRID.text.37], text[GRID.text.38])
pdf("PDF_figure/Venn_HF_HFK_peaks_3UTR.pdf",
    width = 8,
    height = 6)
draw.pairwise.venn(length(peaks.hfk.utr3),
                  length(peaks.hf.utr3),
                  length(peaks.utr3.overlap),
                  catergory <- c("HEAP-CLIP Peaks \nHFK \nin 3' UTR",
                                 "HEAP-CLIP Peaks \nHF \nin 3' UTR"),
                  lty = "blank",
                  ex.text = FALSE,
                  fill = c("pink", "lightblue"),
                  cat.pos = c(250, 135), cat.dist = 0.1, margin = 0.1,
                  fontfamily = "sans", cat.fontfamily = "sans")
## (polygon[GRID.polygon.39], polygon[GRID.polygon.40], polygon[GRID.polygon.41], polygon[GRID.polygon.42], text[GRID.text.43], text[GRID.text.44], lines[GRID.lines.45], text[GRID.text.46], text[GRID.text.47], text[GRID.text.48])
dev.off()
## quartz_off_screen 
##                 2

Cross analysis with HEAP-CLIP data and proteomics and transcriptomics data

Idnetify HF and HFK specific peaks

This analysis is done on all peaks from all regions of the genome since I decided not to limit the analysis on 3'UTR peaks.

hf.specific.peaks <- hf.peaks[!hf.peaks %over% hfk.peaks,]
hfk.specific.peaks <- hfk.peaks[!hfk.peaks %over% hf.peaks,]

How many genes targeted by HF and HFK specific peaks are detected in proteomics

Get a list of genes that are targeted by HF and HFK specific miRNA peaks in their 3' UTR regions. Obtain matched Ensembl gene ID for these genes using AnnotationDbi package. Obtain matching Uniprot ID information using AnnotationDbi package as well as BioMart on Ensembl.

# obtain the list of gene names
hf.uniq.peaks <- as.data.frame(hf.specific.peaks)
hfk.uniq.peaks <- as.data.frame(hfk.specific.peaks)

Annotate peaks with Ensembl ID and Uniprot ID

First we need to nnotate each peak with its potential target gene, 3'UTR annotation gets priority.

## HF
hf.uniq.peaks$'target_gene' <- NA
for (i in 1:dim(hf.uniq.peaks)) {
  if (!is.na(hf.uniq.peaks[i,12]) | !is.na(hf.uniq.peaks[i,13])) {
    gene_name <- unique(unlist(hf.uniq.peaks[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    hf.uniq.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(hf.uniq.peaks[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    hf.uniq.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(hf.uniq.peaks): numerical expression has 2 elements: only the
## first used
hf_gene_list <- as.data.frame(table(hf.uniq.peaks$target_gene))
write.csv(hf_gene_list, 'HF_uniq_target.csv')

## HFK
hfk.uniq.peaks$'target_gene' <- NA
for (i in 1:dim(hfk.uniq.peaks)) {
  if (!is.na(hfk.uniq.peaks[i,12]) | !is.na(hfk.uniq.peaks[i,13])) {
    gene_name <- unique(unlist(hfk.uniq.peaks[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    hfk.uniq.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(hfk.uniq.peaks[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    hfk.uniq.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(hfk.uniq.peaks): numerical expression has 2 elements: only the
## first used
hfk_gene_list <- as.data.frame(table(hfk.uniq.peaks$target_gene))
write.csv(hfk_gene_list, 'HFK_uniq_target.csv')

Try using AnnotationDbi to convert gene name annotations to Ensembl ID and Uniprot ID

# HF
# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
annotations_hf <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(hf_gene_list$Var1),
                                           columns = c("GENEID", "UNIPROTID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_hf$UNIPROTID) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_hf$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_hf <- annotations_hf[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_hf$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID
hf.uniq.peaks <- inner_join(hf.uniq.peaks, annotations_hf, by=c("target_gene"="GENENAME"))
colnames(hf.uniq.peaks)[22] <- "UNIPROTID_Annot" 

# how many genes have Ensembl ID
sum(!is.na(annotations_hf$GENEID))
## [1] 301
# Now I need to acquire a list of UNIPROT ID from Uniprot website retrieval for the same list of target genes
# load the gene name to Uniprot ID conversion table
genename2UniprotID <- read.csv('~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/genename2uniprot_HF.csv')
genename2UniprotID$gene_entry <- as.character(genename2UniprotID$gene_entry)
genename2UniprotID$Entry <- as.character(genename2UniprotID$Entry)

hf.uniq.peaks$UNIPROTID_Web <- NA
for (i in 1:dim(hf.uniq.peaks)[1]) {
  peak.index <- grep(paste("^",hf.uniq.peaks$target_gene[i],"$", sep=""), genename2UniprotID$gene_entry)
  if (length(genename2UniprotID$Entry[peak.index]) != 0) {
      hf.uniq.peaks$UNIPROTID_Web[i] <- paste(genename2UniprotID$Entry[peak.index], collapse = " ")
  }
}

# manually annotate peaks that had no matched Uniprot ID in SwissProt, using the top hit with the corresponding gene name
no_id_list <- as.data.frame(dimnames(table(hf.uniq.peaks$target_gene[is.na(hf.uniq.peaks$UNIPROTID_Web)])))
write.csv(no_id_list, "no.UniprotID.list.HF.csv")

# load the new list that has manually input Uniprot IDs for no ID genes and annotate the peak file
no_id_list_ID_hf <- read.csv('~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/no.UniprotID.list_2_Uniprot_HF.csv')

for (i in 1:dim(hf.uniq.peaks)[1]) {
  peak.index <- grep(paste("^",hf.uniq.peaks$target_gene[i],"$", sep=""), no_id_list_ID_hf$gene_names)
  if (length(no_id_list_ID_hf$UniprotID[peak.index]) != 0) {
      hf.uniq.peaks$UNIPROTID_Web[i] <- paste(no_id_list_ID_hf$UniprotID[peak.index], collapse = " ")
  }
}

# how many genes have Uniprot ID
dim(table(hf.uniq.peaks$UNIPROTID_Web))[1]
## [1] 227

Now we load the enema model colon tumor proteomics files and see how much overlap we get between HF specific peaks and proteomics.

proteom_quant <- read.csv('~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/2015-03_HaigisMouseColon8plex_Prot.csv')
proteom_de <- read.csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/ceMS_diff.csv")

# check how many HF specific HEAP targets are detected in proteomics using AnnotationDbi Uniprot Annotation
hf_detected_proteome <- intersect(hf.uniq.peaks$UNIPROTID_Annot, proteom_quant$Protein.Id)
length(hf_detected_proteome)
## [1] 71
# check how many HF specific HEAP targets are detected in proteomics using Uniprot Website Uniprot Annotation
hf_detected_proteome <- intersect(hf.uniq.peaks$UNIPROTID_Web, proteom_quant$Protein.Id)
length(hf_detected_proteome)
## [1] 163

This shows that annotation using Uniprot website ID retrieval is far superior than using AnnotationDbi package annotated Uniprot ID using EnsDb.Mmusculus.v79 database. So I will also use Uniprot website to annotate the HFK target list with Uniprot ID.

So now we annotate the HFK target list with Ensembl ID and Uniprot ID.

# HFK
# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
annotations_hfk <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(hfk_gene_list$Var1),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_hfk$UNIPROTID) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_hfk$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_hfk <- annotations_hfk[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_hfk$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID
hfk.uniq.peaks <- inner_join(hfk.uniq.peaks, annotations_hfk, by=c("target_gene"="GENENAME"))

# Now I need to acquire a list of UNIPROT ID from Uniprot website retrieval for the same list of target genes
# load the gene name to Uniprot ID conversion table
genename2UniprotID_hfk <- read.csv('~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/genename2uniprot_HFK.csv')
genename2UniprotID_hfk$gene_entry <- as.character(genename2UniprotID_hfk$gene_entry)
genename2UniprotID_hfk$Entry <- as.character(genename2UniprotID_hfk$Entry)

hfk.uniq.peaks$UNIPROTID <- NA
for (i in 1:dim(hfk.uniq.peaks)[1]) {
  peak.index <- grep(paste("^",hfk.uniq.peaks$target_gene[i],"$", sep=""), genename2UniprotID_hfk$gene_entry)
  if (length(genename2UniprotID_hfk$Entry[peak.index]) != 0) {
      hfk.uniq.peaks$UNIPROTID[i] <- paste(genename2UniprotID_hfk$Entry[peak.index], collapse = " ")
  }
}

# manually annotate peaks that had no matched Uniprot ID in SwissProt, using the top hit with the corresponding gene name
no_id_list_hfk <- as.data.frame(dimnames(table(hfk.uniq.peaks$target_gene[is.na(hfk.uniq.peaks$UNIPROTID)])))
write.csv(no_id_list_hfk, "no.UniprotID.list_HFK.csv")

# load the new list that has manually input Uniprot IDs for no ID genes and annotate the peak file
no_id_list_ID_hfk <- read.csv('~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/no.UniprotID.list_2_Uniprot_HFK.csv', col.names = FALSE)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## header and 'col.names' are of different lengths
for (i in 1:dim(hfk.uniq.peaks)[1]) {
  peak.index <- grep(paste("^",hfk.uniq.peaks$target_gene[i],"$", sep=""), rownames(no_id_list_ID_hfk))
  if (length(no_id_list_ID_hfk$FALSE.[peak.index]) != 0) {
      hfk.uniq.peaks$UNIPROTID[i] <- paste(no_id_list_ID_hfk$FALSE.[peak.index], collapse = " ")
  }
}

# how many genes have Uniprot ID
length(table(hfk.uniq.peaks$UNIPROTID))
## [1] 2025
# how many genes have Ensembl ID
length(table(hfk.uniq.peaks$GENEID))
## [1] 2043

Now we check how much overlap we get between HFK specific peaks and proteomics.

# check how many HFK specific HEAP targets are detected in proteomics using Uniprot Website Uniprot Annotation
hfk_detected_proteome <- intersect(hfk.uniq.peaks$UNIPROTID, proteom_quant$Protein.Id)
length(hfk_detected_proteome)
## [1] 1445

Examine protein expressions of HF and HFK specific targets.

Now that we have both HF and HFK specific peaks annotated with target genes, EnsemblID and UniprotID, we can check how many of the protein targets are DE in proteomics dataset.

HF-specific targets

For HF specific peaks, I expect the protein expression to increase in KrasG12D samples. So LFC > 0.

# HF
# here we identify hit HF-specific target proteins, p< 0.05, q < 0.1, LFC > 0
protein_index <- c()
for (i in 1:length(hf_detected_proteome)) {
  protein_index <- c(protein_index, grep(paste("^",hf_detected_proteome[i],"$",sep = ""), proteom_de$Protein.Id))
}
hf_target_proteom_de <- proteom_de[protein_index,]
hf_target_proteom_quant <- proteom_quant[protein_index,]

hit_hf_target_proteom_de <- subset(hf_target_proteom_de, hf_target_proteom_de$p_values<0.05 & hf_target_proteom_de$q_values < 0.1 & hf_target_proteom_de$LFC > 0)
hit_hf_target_proteom_quant <- subset(hf_target_proteom_quant, hf_target_proteom_de$p_values<0.05 & hf_target_proteom_de$q_values < 0.1 & hf_target_proteom_de$LFC > 0)
dim(hit_hf_target_proteom_de)[1]
## [1] 34
write.csv(hit_hf_target_proteom_de, "Hit HF target proteins.csv")

Draw heatmaps of all HF-target proteins

suppressMessages(library(mosaic))

# zscore the quantifications
for (i in 1:dim(hf_target_proteom_quant)[1]) {
   hf_target_proteom_quant[i, 5:12]<- zscore(as.numeric(as.character(hf_target_proteom_quant[i, 5:12])))
}

for (i in 1:dim(hit_hf_target_proteom_quant)[1]) {
   hit_hf_target_proteom_quant[i, 5:12]<- zscore(as.numeric(as.character(hit_hf_target_proteom_quant[i, 5:12])))
}

# Draw heatmap for all HF-target proteins
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(hf_target_proteom_quant[,5:12])
png('All HF-specific target proteins.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "All HF specific\nHEAP target proteins",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_All HF-specific target proteins.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "All HF specific\nHEAP target proteins",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for all HF-specific targets

Draw heatmaps for hit HF protein targets that have p < 0.05, q < 0.1, and LFC > 0

# Draw heatmap for hit HF-target proteins that have p < 0.05, q < 0.1, and LFC > 0
heatmap_matrix <- as.matrix(hit_hf_target_proteom_quant[,5:12])
png('Hit HF-specific target proteins.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Hit HF specific\nHEAP target proteins\np<0.05,q<0.1,LFC>0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Hit HF-specific target proteins.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "Hit HF specific\nHEAP target proteins\np<0.05,q<0.1,LFC>0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for hit HF-specific targets

HFK-specific targets

For HFK specific peaks, I expect the protein expression to decrease in KrasG12D samples. So LFC < 0.

# HFK
# here we identify hit HFK-specific target proteins, p< 0.05, q < 0.1, LFC < 0
protein_index <- c()
for (i in 1:length(hfk_detected_proteome)) {
  protein_index <- c(protein_index, grep(paste("^",hfk_detected_proteome[i],"$",sep = ""), proteom_de$Protein.Id))
}
hfk_target_proteom_de <- proteom_de[protein_index,]
hfk_target_proteom_quant <- proteom_quant[protein_index,]

hit_hfk_target_proteom_de <- subset(hfk_target_proteom_de, hfk_target_proteom_de$p_values<0.05 & hfk_target_proteom_de$q_values < 0.1 & hfk_target_proteom_de$LFC < 0)
hit_hfk_target_proteom_quant <- subset(hfk_target_proteom_quant, hfk_target_proteom_de$p_values<0.05 & hfk_target_proteom_de$q_values < 0.1 & hfk_target_proteom_de$LFC < 0)
dim(hit_hfk_target_proteom_de)[1]
## [1] 109
write.csv(hit_hfk_target_proteom_de, "Hit HFK target proteins.csv")

Draw heatmaps of all HFK-specific target proteins

# zscore the quantifications
for (i in 1:dim(hfk_target_proteom_quant)[1]) {
   hfk_target_proteom_quant[i, 5:12]<- zscore(as.numeric(as.character(hfk_target_proteom_quant[i, 5:12])))
}

for (i in 1:dim(hit_hfk_target_proteom_quant)[1]) {
   hit_hfk_target_proteom_quant[i, 5:12]<- zscore(as.numeric(as.character(hit_hfk_target_proteom_quant[i, 5:12])))
}

# Draw heatmap for all HEAP-target proteins
heatmap_matrix <- as.matrix(hfk_target_proteom_quant[,5:12])
png('All HFK-specific target proteins.png',
    width = 600,
    height = 1200,
    res = 200,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "All HFK specific\nHEAP target proteins",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_All HFK-specific target proteins.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "All HFK specific\nHEAP target proteins",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for all HFK-specific targets

Draw heatmaps for hit HFK protein targets that have p < 0.05, q < 0.1, and LFC < 0

# Draw heatmap for hit HFK-target proteins that have p < 0.05, q < 0.1, and LFC < 0
heatmap_matrix <- as.matrix(hit_hfk_target_proteom_quant[,5:12])
png('Hit HFK-specific target proteins.png',
    width = 600,
    height = 1400,
    res = 200,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Hit HFK specific\nHEAP target proteins\np<0.05,q<0.1,LFC<0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Hit HFK-specific target proteins.pdf',
    width = 6,
    height = 10)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Hit HFK specific\nHEAP target proteins\np<0.05,q<0.1,LFC<0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for hit HFK-specific targets

weird HFK-specific targets

It seems like that there are a group of HFK-specific protein targets that somehow have upregulated protein expression. There seem to be a large number of them, even more than hit HFK target proteins. I want to pull them out for further inspection.

# here we identify weird HF-specific target proteins, p< 0.05, q < 0.1, LFC > 0
weird_hfk_target_proteom_de <- subset(hfk_target_proteom_de, hfk_target_proteom_de$p_values<0.05 & hfk_target_proteom_de$q_values < 0.1 & hfk_target_proteom_de$LFC > 0)
weird_hfk_target_proteom_quant <- subset(hfk_target_proteom_quant, hfk_target_proteom_de$p_values<0.05 & hfk_target_proteom_de$q_values < 0.1 & hfk_target_proteom_de$LFC > 0)
dim(weird_hfk_target_proteom_de)[1]
## [1] 337
write.csv(weird_hfk_target_proteom_de, "weird HFK target proteins.csv")

Draw heatmaps for hit HFK protein targets that have p < 0.05, q < 0.1, and LFC < 0

# Draw heatmap for hit HFK-target proteins that have p < 0.05, q < 0.1, and LFC > 0
heatmap_matrix <- as.matrix(weird_hfk_target_proteom_quant[,5:12])
png('Weird HFK-specific target proteins.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Weird HFK specific\nHEAP target proteins\np<0.05,q<0.1,LFC>0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Weird HFK-specific target proteins.pdf',
    width = 6,
    height = 10)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Weird HFK specific\nHEAP target proteins\np<0.05,q<0.1,LFC>0",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for weird HFK-specific targets

Now lets take a look at transcriptomics

Load the RNA-Seq data.

rna_de <- read.csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/Differential Analysis.csv")
rna_quant <- read.csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/normalized_raw_gene_counts.csv")

HF-specific targets

How many of the HF-specific targets are detected in transcriptomics?

# check how many HF-specific targets are detected in transcriptomics
hf_detected_rna <- intersect(hf.uniq.peaks$GENEID, rna_quant$X)
length(hf_detected_rna)
## [1] 255
rna_index <- c()
for (i in 1:length(hf_detected_rna)){
  rna_index <- c(rna_index, grep(hf_detected_rna[i], rna_quant$X))
}

hf_target_rna_quant <- rna_quant[rna_index,]
hf_target_rna_de <- rna_de[rna_index,]

# zscoring the RNA count quantifications
for (i in 1:dim(hf_target_rna_quant)[1]) {
  hf_target_rna_quant[i,2:9] <- zscore(as.numeric(as.character(hf_target_rna_quant[i,2:9])))
}

Draw the heapmat for all HF target transcriptome.

heatmap_matrix <- as.matrix(hf_target_rna_quant[,2:9])
png('All HF-specific target RNA.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "All HF specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_All HF-specific target RNA.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "All HF specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for all HF target RNA

Examine the transcriptome of hit HF-specific targets

# Annotate hit HF target proteome list with Ensembl ID.
hit_hf_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hit_hf_target_proteom_de)[1]) {
  u2e_index <- grep(hit_hf_target_proteom_de$Protein.Id[i], hf.uniq.peaks$UNIPROTID_Web)
  hit_hf_target_proteom_de$Ensembl_ID[i] <- hf.uniq.peaks$GENEID[u2e_index[1]]
}

# check how many hit HEAP targets are detected in transcriptomics
hit_hf_detected_rna <- intersect(hit_hf_target_proteom_de$Ensembl_ID, rna_quant$X)
length(hit_hf_detected_rna)
## [1] 34
hit_rna_index <- c()
for (i in 1:length(hit_hf_detected_rna)){
  hit_rna_index <- c(hit_rna_index, grep(hit_hf_detected_rna[i], rna_quant$X))
}

hit_hf_target_rna_quant <- rna_quant[hit_rna_index,]
hit_hf_target_rna_de <- rna_de[hit_rna_index,]

# zscoring the RNA count quantifications
for (i in 1:dim(hit_hf_target_rna_quant)[1]) {
  hit_hf_target_rna_quant[i,2:9] <- zscore(as.numeric(as.character(hit_hf_target_rna_quant[i,2:9])))
}

Draw the heapmat for hit HF-specific target transcriptome.

heatmap_matrix <- as.matrix(hit_hf_target_rna_quant[,2:9])
png('Hit HF-specific target RNA.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Hit HF specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Hit HF-specific target RNA.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "Hit HF specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for hit HF target RNA

HFK-specific targets

How many of the HFK-specific targets are detected in transcriptomics?

# check how many HFK-specific targets are detected in transcriptomics
hfk_detected_rna <- intersect(hfk.uniq.peaks$GENEID, rna_quant$X)
length(hfk_detected_rna)
## [1] 1989
rna_index <- c()
for (i in 1:length(hfk_detected_rna)){
  rna_index <- c(rna_index, grep(hfk_detected_rna[i], rna_quant$X))
}

hfk_target_rna_quant <- rna_quant[rna_index,]
hfk_target_rna_de <- rna_de[rna_index,]

# zscoring the RNA count quantifications
for (i in 1:dim(hfk_target_rna_quant)[1]) {
  hfk_target_rna_quant[i,2:9] <- zscore(as.numeric(as.character(hfk_target_rna_quant[i,2:9])))
}

Draw the heapmat for all HF target transcriptome.

heatmap_matrix <- as.matrix(hfk_target_rna_quant[,2:9])
png('All HFK-specific target RNA.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
heatmap.2(heatmap_matrix,
          main = "All HFK specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_All HFK-specific target RNA.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "All HFK specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for all HFK target RNA

Examine the transcriptome of hit HFK-specific targets

# Annotate hit HFK target proteome list with Ensembl ID.
hit_hfk_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hit_hfk_target_proteom_de)[1]) {
  u2e_index <- grep(hit_hfk_target_proteom_de$Protein.Id[i], hfk.uniq.peaks$UNIPROTID)
  hit_hfk_target_proteom_de$Ensembl_ID[i] <- hfk.uniq.peaks$GENEID[u2e_index[1]]
}

# check how many hit HEAP targets are detected in transcriptomics
hit_hfk_detected_rna <- intersect(hit_hfk_target_proteom_de$Ensembl_ID, rna_quant$X)
length(hit_hfk_detected_rna)
## [1] 108
hit_rna_index <- c()
for (i in 1:length(hit_hfk_detected_rna)){
  hit_rna_index <- c(hit_rna_index, grep(hit_hfk_detected_rna[i], rna_quant$X))
}

hit_hfk_target_rna_quant <- rna_quant[hit_rna_index,]
hit_hfk_target_rna_de <- rna_de[hit_rna_index,]

# zscoring the RNA count quantifications
for (i in 1:dim(hit_hfk_target_rna_quant)[1]) {
  hit_hfk_target_rna_quant[i,2:9] <- zscore(as.numeric(as.character(hit_hfk_target_rna_quant[i,2:9])))
}

Draw the heapmat for hit HEAP target transcriptome.

heatmap_matrix <- as.matrix(hit_hfk_target_rna_quant[,2:9])
png('Hit HFK-specific target RNA.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Hit HFK specific\nHEAP target RNA\npadj",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Hit HFK-specific target RNA.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "Hit HFK specific\nHEAP target RNA\npadj",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for hit HFK target RNA

examine the transcriptome of werid HFK-specific targets

# Annotate weird HFK target proteome list with Ensembl ID.
weird_hfk_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(weird_hfk_target_proteom_de)[1]) {
  u2e_index <- grep(weird_hfk_target_proteom_de$Protein.Id[i], hfk.uniq.peaks$UNIPROTID)
  weird_hfk_target_proteom_de$Ensembl_ID[i] <- hfk.uniq.peaks$GENEID[u2e_index[1]]
}

# check how many hit HEAP targets are detected in transcriptomics
weird_hfk_detected_rna <- intersect(weird_hfk_target_proteom_de$Ensembl_ID, rna_quant$X)
length(weird_hfk_detected_rna)
## [1] 332
weird_rna_index <- c()
for (i in 1:length(weird_hfk_detected_rna)){
  weird_rna_index <- c(weird_rna_index, grep(weird_hfk_detected_rna[i], rna_quant$X))
}

weird_hfk_target_rna_quant <- rna_quant[weird_rna_index,]
weird_hfk_target_rna_de <- rna_de[weird_rna_index,]

# zscoring the RNA count quantifications
for (i in 1:dim(weird_hfk_target_rna_quant)[1]) {
  weird_hfk_target_rna_quant[i,2:9] <- zscore(as.numeric(as.character(weird_hfk_target_rna_quant[i,2:9])))
}

Draw the heapmat for weird HEAP target transcriptome.

heatmap_matrix <- as.matrix(weird_hfk_target_rna_quant[,2:9])
png('Weird HFK-specific target RNA.png',
    width = 300,
    height = 600,
    res = 100,
    pointsize = 8)
par(cex.main = 1)
heatmap.2(heatmap_matrix,
          main = "Weird HFK specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap_Weird HFK-specific target RNA.pdf',
    width = 6,
    height = 10)
heatmap.2(heatmap_matrix,
          main = "Weird HFK specific\nHEAP target RNA",
          density.info = "none",
          key = TRUE,
          lhei = c(1,7),
          col=my_palette,
          cexCol = 1,
          margins = c(8,2),
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          keysize = 2,
          ylab = "Genes",
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for weird HFK target RNA

Correlation between HEAP target proteomic and transcriptomics

Draw Venn Diagram to demonstrate the overlap between hit HF and HFK targets DE protein and DE RNA

hit_hf_target_rna_sig <- subset(hit_hf_target_rna_de, hit_hf_target_rna_de$padj < 0.05 & hit_hf_target_rna_de$log2FoldChange > 0)
dim(hit_hf_target_rna_sig)[1]
## [1] 15
hit_hfk_target_rna_sig <- subset(hit_hfk_target_rna_de, hit_hfk_target_rna_de$padj < 0.05 & hit_hfk_target_rna_de$log2FoldChange < 0)
dim(hit_hfk_target_rna_sig)[1]
## [1] 7
# RNA and protein DE for HF-specific targets
grid.newpage()
draw.pairwise.venn(dim(hit_hf_target_proteom_de)[1],
                   dim(hit_hf_target_rna_sig)[1],
                   dim(hit_hf_target_rna_sig)[1],
                   catergory <- c("Hit HF targets protein\np<0.05,q<0.1,LFC>0",
                                  "Hit HF targets RNA\npadj<0.05,LFC>0"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.05, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")

## (polygon[GRID.polygon.49], polygon[GRID.polygon.50], polygon[GRID.polygon.51], polygon[GRID.polygon.52], text[GRID.text.53], text[GRID.text.54], text[GRID.text.55], text[GRID.text.56])
pdf('PDF_figure/Venn_ HF-specific target RNA-protein.pdf',
    width = 8,
    height = 6)
draw.pairwise.venn(dim(hit_hf_target_proteom_de)[1],
                   dim(hit_hf_target_rna_sig)[1],
                   dim(hit_hf_target_rna_sig)[1],
                   catergory <- c("Hit HF targets protein\np<0.05,q<0.1,LFC>0",
                                  "Hit HF targets RNA\npadj<0.05,LFC>0"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.05, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")
## (polygon[GRID.polygon.57], polygon[GRID.polygon.58], polygon[GRID.polygon.59], polygon[GRID.polygon.60], text[GRID.text.61], text[GRID.text.62], text[GRID.text.63], text[GRID.text.64])
dev.off()
## quartz_off_screen 
##                 2
# RNA and protein DE for HFK-specific targets
grid.newpage()
draw.pairwise.venn(dim(hit_hfk_target_proteom_de)[1],
                   dim(hit_hfk_target_rna_sig)[1],
                   dim(hit_hfk_target_rna_sig)[1],
                   catergory <- c("Hit HFK targets protein\np<0.05,q<0.1,LFC<0",
                                  "Hit HFK targets RNA\npadj<0.05,LFC<0"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.05, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")

## (polygon[GRID.polygon.65], polygon[GRID.polygon.66], polygon[GRID.polygon.67], polygon[GRID.polygon.68], text[GRID.text.69], text[GRID.text.70], text[GRID.text.71], text[GRID.text.72])
pdf('PDF_figure/Venn_ HFK-specific target RNA-protein.pdf',
    width = 8,
    height = 6)
draw.pairwise.venn(dim(hit_hfk_target_proteom_de)[1],
                   dim(hit_hfk_target_rna_sig)[1],
                   dim(hit_hfk_target_rna_sig)[1],
                   catergory <- c("Hit HFK targets protein\np<0.05,q<0.1,LFC<0",
                                  "Hit HFK targets RNA\npadj<0.05,LFC<0"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.05, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")
## (polygon[GRID.polygon.73], polygon[GRID.polygon.74], polygon[GRID.polygon.75], polygon[GRID.polygon.76], text[GRID.text.77], text[GRID.text.78], text[GRID.text.79], text[GRID.text.80])
dev.off()
## quartz_off_screen 
##                 2

Calculate the correlation coefficient between transcriptome and proteome for all HF and HFK targets.

# annotate the HF and HFK proteome de analysis file with Ensembl ID.
hf_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hf_target_proteom_de)[1]) {
  de_index <- grep(hf_target_proteom_de$Protein.Id[i], hf.uniq.peaks$UNIPROTID_Web)
  hf_target_proteom_de$Ensembl_ID[i] <- hf.uniq.peaks$GENEID[de_index[1]]
}

hfk_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hfk_target_proteom_de)[1]) {
  de_index <- grep(hfk_target_proteom_de$Protein.Id[i], hfk.uniq.peaks$UNIPROTID)
  hfk_target_proteom_de$Ensembl_ID[i] <- hfk.uniq.peaks$GENEID[de_index[1]]
}

# HF
## find out the overlap for HF specific peaks
overlap_hf <- intersect(hf_target_proteom_de$Ensembl_ID, hf_target_rna_de$X)
hf_rna_lfc <- c()
hf_protein_lfc <- c()
hf_overlap_ID <- c()
for (i in 1:length(overlap_hf)) {
  rna_index <- grep(paste("^",overlap_hf[i],"$", sep=""), hf_target_rna_de$X)
  protein_index <- grep(paste("^",overlap_hf[i],"$", sep=""), hf_target_proteom_de$Ensembl_ID)
  if (length(protein_index) == 1) {
      hf_rna_lfc <- c(hf_rna_lfc, hf_target_rna_de$log2FoldChange[rna_index])
      hf_protein_lfc <- c(hf_protein_lfc, hf_target_proteom_de$LFC[protein_index])
      hf_overlap_ID <- c(hf_overlap_ID, overlap_hf[i])
  }
}

hf_overlap_lfc <- as.data.frame(cbind(hf_overlap_ID, hf_rna_lfc, hf_protein_lfc))
rownames(hf_overlap_lfc) <- hf_overlap_lfc$hf_overlap_ID
hf_overlap_lfc <- hf_overlap_lfc[,-1]
hf_overlap_lfc$hf_rna_lfc <- as.numeric(as.character(hf_overlap_lfc$hf_rna_lfc))
hf_overlap_lfc$hf_protein_lfc <- as.numeric(as.character(hf_overlap_lfc$hf_protein_lfc))

## Plot scatterplot and calculate the Spearman Correlation
ggplot(hf_overlap_lfc, aes(x = hf_rna_lfc, y = hf_protein_lfc)) +
  geom_point(data = hf_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("All HF-specific target RNA LFC") + ylab("All HF-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HF-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_All HF-specific target_RNA-protein.pdf",
    height = 6,
    width = 8)
ggplot(hf_overlap_lfc, aes(x = hf_rna_lfc, y = hf_protein_lfc)) +
  geom_point(data = hf_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("All HF-specific target RNA LFC") + ylab("All HF-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HF-specific targets") +
  xlim(-3,3) + ylim(-3,3)
dev.off()
## quartz_off_screen 
##                 2
## Correlation test
cor.test(hf_overlap_lfc$hf_rna_lfc, hf_overlap_lfc$hf_protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 8.127, df = 159, p-value = 1.163e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4224687 0.6425741
## sample estimates:
##       cor 
## 0.5417432
# HFK
## find out the overlap for HFK specific peaks
overlap_hfk <- intersect(hfk_target_proteom_de$Ensembl_ID, hfk_target_rna_de$X)
hfk_rna_lfc <- c()
hfk_protein_lfc <- c()
hfk_overlap_ID <- c()
for (i in 1:length(overlap_hfk)) {
  rna_index <- grep(paste("^",overlap_hfk[i],"$", sep=""), hfk_target_rna_de$X)
  protein_index <- grep(paste("^",overlap_hfk[i],"$", sep=""), hfk_target_proteom_de$Ensembl_ID)
  if (length(protein_index) == 1) {
      hfk_rna_lfc <- c(hfk_rna_lfc, hfk_target_rna_de$log2FoldChange[rna_index])
      hfk_protein_lfc <- c(hfk_protein_lfc, hfk_target_proteom_de$LFC[protein_index])
      hfk_overlap_ID <- c(hfk_overlap_ID, overlap_hfk[i])
  }
}

hfk_overlap_lfc <- as.data.frame(cbind(hfk_overlap_ID, hfk_rna_lfc, hfk_protein_lfc))
rownames(hfk_overlap_lfc) <- hfk_overlap_lfc$hfk_overlap_ID
hfk_overlap_lfc <- hfk_overlap_lfc[,-1]
hfk_overlap_lfc$hfk_rna_lfc <- as.numeric(as.character(hfk_overlap_lfc$hfk_rna_lfc))
hfk_overlap_lfc$hfk_protein_lfc <- as.numeric(as.character(hfk_overlap_lfc$hfk_protein_lfc))

## Plot scatterplot and calculate the Spearman Correlation
ggplot(hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("All HFK-specific target RNA LFC") + ylab("All HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_All HFK-specific target_RNA-protein.pdf",
    height = 6,
    width = 8)
ggplot(hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("All HFK-specific target RNA LFC") + ylab("All HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)
dev.off()
## quartz_off_screen 
##                 2
## Correlation test
cor.test(hfk_overlap_lfc$hfk_rna_lfc, hfk_overlap_lfc$hfk_protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 19.143, df = 1426, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4098838 0.4924677
## sample estimates:
##       cor 
## 0.4521443

As that there is an enrichment of protein expression upregulation in HFK-specific targets (as opposed to the expectation of enriched downregulation), I will plot this again, but with significantly increased and decreased protein annotated.

# add p and q from proteomics and padj from RNA-Seq to the LFC dataset
hfk_overlap_lfc$EnsemblID <- rownames(hfk_overlap_lfc)
hfk_overlap_lfc <- as_tibble(hfk_overlap_lfc)
hfk_overlap_lfc <- inner_join(hfk_overlap_lfc, hfk_target_proteom_de[,c(5,6,9)], by = c("EnsemblID"="Ensembl_ID"))
hfk_overlap_lfc <- inner_join(hfk_overlap_lfc, hfk_target_rna_de[,c(1,7)], by = c("EnsemblID"="X"))

# plot the graph
ggplot(hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hfk_overlap_lfc, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data=subset(hfk_overlap_lfc, p_values < 0.05 & q_values < 0.1 & hfk_protein_lfc > 0), alpha = 0.5, size = 1, color = "red") + 
  geom_point(data=subset(hfk_overlap_lfc, p_values < 0.05 & q_values < 0.1 & hfk_protein_lfc < 0), alpha = 0.5, size = 1, color = "blue") +
  xlab("All HFK-specific target RNA LFC") + ylab("All HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_All HFK-specific target_RNA-protein_DEMarked.pdf",
    height = 6,
    width = 8)
ggplot(hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hfk_overlap_lfc, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data=subset(hfk_overlap_lfc, p_values < 0.05 & q_values < 0.1 & hfk_protein_lfc > 0), alpha = 0.5, size = 1, color = "red") + 
  geom_point(data=subset(hfk_overlap_lfc, p_values < 0.05 & q_values < 0.1 & hfk_protein_lfc < 0), alpha = 0.5, size = 1, color = "blue") +
  xlab("All HFK-specific target RNA LFC") + ylab("All HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for all HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)
dev.off()
## quartz_off_screen 
##                 2

Calculate the correlation coefficient between transcriptome and proteome for hit HF/HFK targets.

# annotate the proteome de analysis file with Ensembl ID.
hit_hf_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hit_hf_target_proteom_de)[1]) {
  de_index <- grep(hit_hf_target_proteom_de$Protein.Id[i], hf.uniq.peaks$UNIPROTID_Web)
  hit_hf_target_proteom_de$Ensembl_ID[i] <- hf.uniq.peaks$GENEID[de_index[1]]
}

hit_hfk_target_proteom_de$Ensembl_ID <- NA
for (i in 1:dim(hit_hfk_target_proteom_de)[1]) {
  de_index <- grep(hit_hfk_target_proteom_de$Protein.Id[i], hfk.uniq.peaks$UNIPROTID)
  hit_hfk_target_proteom_de$Ensembl_ID[i] <- hfk.uniq.peaks$GENEID[de_index[1]]
}

# HF
## find out the overlap
hit_hf_overlap <- intersect(hit_hf_target_proteom_de$Ensembl_ID, hit_hf_target_rna_de$X)
hit_hf_rna_lfc <- c()
hit_hf_protein_lfc <- c()
hit_hf_overlap_ID <- c()
for (i in 1:length(hit_hf_overlap)) {
  hit_rna_index <- grep(paste("^",hit_hf_overlap[i],"$", sep=""), hit_hf_target_rna_de$X)
  hit_protein_index <- grep(paste("^",hit_hf_overlap[i],"$", sep=""), hit_hf_target_proteom_de$Ensembl_ID)
  if (length(hit_protein_index) == 1) {
      hit_hf_rna_lfc <- c(hit_hf_rna_lfc, hit_hf_target_rna_de$log2FoldChange[hit_rna_index])
      hit_hf_protein_lfc <- c(hit_hf_protein_lfc, hit_hf_target_proteom_de$LFC[hit_protein_index])
      hit_hf_overlap_ID <- c(hit_hf_overlap_ID, hit_hf_overlap[i])
  }
}

hit_hf_overlap_lfc <- as.data.frame(cbind(hit_hf_overlap_ID, hit_hf_rna_lfc, hit_hf_protein_lfc))
rownames(hit_hf_overlap_lfc) <- hit_hf_overlap_lfc$hit_hf_overlap_ID
hit_hf_overlap_lfc <- hit_hf_overlap_lfc[,-1]
hit_hf_overlap_lfc$hit_hf_rna_lfc <- as.numeric(as.character(hit_hf_overlap_lfc$hit_hf_rna_lfc))
hit_hf_overlap_lfc$hit_hf_protein_lfc <- as.numeric(as.character(hit_hf_overlap_lfc$hit_hf_protein_lfc))

## Plot scatterplot and calculate the Spearman Correlation
ggplot(hit_hf_overlap_lfc, aes(x = hit_hf_rna_lfc, y = hit_hf_protein_lfc)) +
  geom_point(data = hit_hf_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Hit HF-specific target RNA LFC") + ylab("Hit HF-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for hit HF-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_Hit HF-specific target_RNA-protein.pdf",
    height = 6,
    width = 8)
ggplot(hit_hf_overlap_lfc, aes(x = hit_hf_rna_lfc, y = hit_hf_protein_lfc)) +
  geom_point(data = hit_hf_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Hit HF-specific target RNA LFC") + ylab("Hit HF-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for hit HF-specific targets") +
  xlim(-3,3) + ylim(-3,3)
dev.off()
## quartz_off_screen 
##                 2
## Correlation test
cor.test(hit_hf_overlap_lfc$hit_hf_rna_lfc, hit_hf_overlap_lfc$hit_hf_protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 3.5941, df = 32, p-value = 0.001079
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2419896 0.7402021
## sample estimates:
##       cor 
## 0.5362708
# HFK
## Hit targets
## find out the overlap
hit_hfk_overlap_lfc <- as.data.frame(hit_hfk_target_proteom_de$Ensembl_ID)
colnames(hit_hfk_overlap_lfc) <- "Ensembl_ID"
hit_hfk_overlap_lfc <- inner_join(hit_hfk_overlap_lfc, hfk_overlap_lfc, by = c("Ensembl_ID" = "EnsemblID"))

## Plot scatterplot and calculate the Spearman Correlation
ggplot(hit_hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hit_hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Hit HFK-specific target RNA LFC") + ylab("Hit HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for hit HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_Hit HFK-specific target_RNA-protein.pdf",
    height = 6,
    width = 8)
ggplot(hit_hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = hit_hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Hit HFK-specific target RNA LFC") + ylab("Hit HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for hit HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)
dev.off()
## quartz_off_screen 
##                 2
## Correlation test
cor.test(hit_hfk_overlap_lfc$hfk_rna_lfc, hit_hfk_overlap_lfc$hfk_protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 2.5211, df = 106, p-value = 0.01319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05117417 0.40846273
## sample estimates:
##      cor 
## 0.237848

Calculate the correlation coefficient between transcriptome and proteome for weird HF/HFK targets.

# HFK
## Weird targets
## find out the overlap
weird_hfk_overlap_lfc <- as.data.frame(weird_hfk_target_proteom_de$Ensembl_ID)
colnames(weird_hfk_overlap_lfc) <- "Ensembl_ID"
weird_hfk_overlap_lfc <- inner_join(weird_hfk_overlap_lfc, hfk_overlap_lfc, by = c("Ensembl_ID" = "EnsemblID"))

## Plot scatterplot and calculate the Spearman Correlation
ggplot(weird_hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = weird_hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Weird HFK-specific target RNA LFC") + ylab("Weird HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for Weird HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)

pdf("PDF_figure/Correlation_weird HFK-specific target_RNA-protein.pdf",
    height = 6,
    width = 8)
ggplot(weird_hfk_overlap_lfc, aes(x = hfk_rna_lfc, y = hfk_protein_lfc)) +
  geom_point(data = weird_hfk_overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("Weird HFK-specific target RNA LFC") + ylab("Weird HFK-specific target protein LFC") + 
  labs(title = "Correlation between proteomics and transcriptomics for Weird HFK-specific targets") +
  xlim(-3,3) + ylim(-3,3)

dev.off()
## quartz_off_screen 
##                 2
## Correlation test
cor.test(weird_hfk_overlap_lfc$hfk_rna_lfc, weird_hfk_overlap_lfc$hfk_protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 7.9738, df = 330, p-value = 2.546e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3075967 0.4884339
## sample estimates:
##      cor 
## 0.401927

Identify peaks associated with Hit HF and HFK targets and weird HFK targets

# HF
hit_hf_peak_index <- c()
for (i in 1:dim(hit_hf_target_proteom_de)[1]) {
  hit_hf_peak_index <- c(hit_hf_peak_index, grep(hit_hf_target_proteom_de$Ensembl_ID[i], hf.uniq.peaks$GENEID))
}
hit_hf_peak_list <- hf.uniq.peaks[hit_hf_peak_index,]

# HFK
hit_hfk_peak_index <- c()
for (i in 1:dim(hit_hfk_target_proteom_de)[1]) {
  hit_hfk_peak_index <- c(hit_hfk_peak_index, grep(hit_hfk_target_proteom_de$Ensembl_ID[i], hfk.uniq.peaks$GENEID))
}
hit_hfk_peak_list <- hfk.uniq.peaks[hit_hfk_peak_index,]

# weird HFK peaks
weird_hfk_peak_index <- c()
for (i in 1:dim(weird_hfk_target_proteom_de)[1]) {
  weird_hfk_peak_index <- c(weird_hfk_peak_index, grep(weird_hfk_target_proteom_de$Ensembl_ID[i], hfk.uniq.peaks$GENEID))
}
weird_hfk_peak_list <- hfk.uniq.peaks[weird_hfk_peak_index,]

Putative miRNA candidates for HEAP targets

Kmer enrichment analysis

All analysis is done based on identification of 6mer since nt2-7 position is absolutely essential for miRNA targeting.

Load miRBase mature miRNA fa file and mm10 genome

mirnas <- readRNAStringSet("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/mature.fa")
mirnas <- DNAStringSet(mirnas[grepl("mmu-", names(mirnas))])
names(mirnas) <- sapply(strsplit(names(mirnas), " "), "[", 1)
names(mirnas) <- substring(names(mirnas), first = 5)
bsgenome <- load.bsgenome("mm10")

Now we look for enriched 6mers/7mers/8mers in HEAP peaks.

Overlap peaks

First we look at that are the enriched kmers in the overlapping peaks between HF and HFK. These are the miRNA targeting no affected by mutant KRas.

#' Find all miRNA sequences associated with the k-mer
#' 
#' Find all miRNA mature sequences where a reverse complement to the k-mer
#' matches exactly somewhere within the sequence.
#' 
#' @param kmer A single k-mer to search for.
#' @param mirnas DNAStringSet with mnature miRNA sequences (with miRNA names).
#' @param in.seed Logical. If TRUE, only associate with miRNAs where k-mer
#'        occurs in the seed, i.e., within 1st to 8th nt.
#' @param collapse Logical. If TRUE, collapse all output miRNA names and
#'        separate by ",".
#' @return Vector of names of miRNAs associated with the k-mer, or a collapsed
#'         string of names.

# define functions
associateKmerWithMiRNA <- function(kmer, mirnas, in.seed = TRUE,
                                   collapse = TRUE) {
  occurrences <- unlist(vmatchPattern(reverseComplement(DNAString(kmer)),
                                      mirnas))
  if (in.seed) {
    occurrences <- occurrences[end(occurrences) <= 8]
  }
  mirna.names <- sort(names(occurrences))
  if (collapse) {
    mirna.names <- paste0(mirna.names, collapse = ",")
  }
  return(mirna.names)
}

findEnrichedKmersPeaks <- function(peaks, kmer.background, k = 6, n = 50) {
  enriched.kmers <- findKmerEnrich(peaks, k = k, genomeTag = "mm10",
                                   kmer.background = kmer.background)[1:n]
  enriched.kmers <-
    data.table(kmer = names(enriched.kmers),
               enrich = enriched.kmers,
               miR = sapply(names(enriched.kmers),
                            associateKmerWithMiRNA, mirnas = mirnas))
  enriched.kmers
}
# get the table of overlap peaks
overlap.peaks <- as.data.frame(peaks.overlap)

# establish proper background
# for the overlapping peaks, background is all peaks detected in HF and HFK samples, disregarding padj
peaks.all.only.hf <- peaks.all.hf[!peaks.all.hf %over% peaks.all.hfk,]
all.peaks <- c(peaks.all.only.hf, peaks.all.hfk)

# calculate all the kmer backgrounds
k6.background.overlap <- calculateKmerBackground(k=6, genomeTag = "mm10", gr.overlap = all.peaks,
                                         exons.only = TRUE)
k7.background.overlap <- calculateKmerBackground(k=7, genomeTag = "mm10", gr.overlap = all.peaks,
                                         exons.only = TRUE)
k8.background.overlap <- calculateKmerBackground(k=8, genomeTag = "mm10", gr.overlap = all.peaks,
                                         exons.only = TRUE)

# 6mer
overlap.peaks.k6 <- findEnrichedKmersPeaks(peaks.overlap, k6.background.overlap, k=6)
ggplot(overlap.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in HF/HFK overlapped peaks")

pdf("PDF_figure/Kmer_overlap_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(overlap.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in HF/HFK overlapped peaks")
dev.off()
## quartz_off_screen 
##                 2
overlap.peaks.k6[1:10,]
##       kmer   enrich
##  1: TACCTC 3.409097
##  2: GGGGGG 3.284653
##  3: CTACCT 3.098473
##  4: ACCTCA 2.667598
##  5: TGTTAC 2.646359
##  6: CGCACA 2.531632
##  7: TCTACC 2.515210
##  8: ACGCAC 2.379125
##  9: ACCTCG 2.266956
## 10: CAGTAT 2.228120
##                                                                                                                                                        miR
##  1:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  2:                                                                                                                                            miR-7082-3p
##  3: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  4:                          let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-672-5p,miR-7674-5p,miR-98-5p
##  5:                                                                                                                                             miR-194-5p
##  6:                                                                                                                                  miR-147-3p,miR-210-3p
##  7:                                                                                                         miR-1193-5p,miR-1839-5p,miR-376a-5p,miR-379-5p
##  8:                                                                                                                                             miR-210-3p
##  9:                                                                                                                                             miR-381-5p
## 10:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
# 7mers
overlap.peaks.k7 <- findEnrichedKmersPeaks(peaks.overlap, k7.background.overlap, k=7)
ggplot(overlap.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in HF/HFK overlapped peaks")

pdf("PDF_figure/Kmer_overlap_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(overlap.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in HF/HFK overlapped peaks")
dev.off()
## quartz_off_screen 
##                 2
overlap.peaks.k7[1:10,]
##        kmer   enrich
##  1: GGGGGGG 4.171863
##  2: CTACCTC 4.040938
##  3: TACCTCA 3.772608
##  4: TCTACCT 3.389743
##  5: CGCACAA 3.231649
##  6: TACCTCG 3.025730
##  7: CTGTTAC 2.950827
##  8: ACTACCT 2.948568
##  9: AGTATTA 2.931563
## 10: ACCTCAG 2.834097
##                                                                                                           miR
##  1:                                                                                                          
##  2: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  3:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  4:                                                                                               miR-1839-5p
##  5:                                                                                                          
##  6:                                                                                                          
##  7:                                                                                                miR-194-5p
##  8:                                                                          miR-196a-5p,miR-196b-5p,miR-3962
##  9:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
## 10:
# 8mers
overlap.peaks.k8 <- findEnrichedKmersPeaks(peaks.overlap, k8.background.overlap, k=8)
ggplot(overlap.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in HF/HFK overlapped peaks")

pdf("PDF_figure/Kmer_overlap_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(overlap.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in HF/HFK overlapped peaks")
dev.off()
## quartz_off_screen 
##                 2
overlap.peaks.k8[1:10,]
##         kmer   enrich
##  1: GGGGGGGG 4.578830
##  2: CTACCTCA 4.161117
##  3: TCTACCTC 4.094011
##  4: TACCTCAG 3.839490
##  5: CAGTATTA 3.425774
##  6: GCTACCTC 3.373039
##  7: ACTACCTC 3.342690
##  8: TACCTCAA 3.215100
##  9: CCTACCTC 3.198930
## 10: ATCTACCT 3.187035
##                                                                                                 miR
##  1:                                                                                                
##  2: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  3:                                                                                                
##  4:                                                                                                
##  5:                                                              miR-200b-3p,miR-200c-3p,miR-429-3p
##  6:                                                                                                
##  7:                                                                                                
##  8:                                                                                                
##  9:                                                                                                
## 10:

As a sanity check, we need to compare the above enrichment with enrichment of k-mers in whole transcript sequences (because the composition of peak position is ~50% 3'UTR, ~30% exon and ~20% intron).

findEnrichedKmersSeqs <- function(seqs, kmer.background, k = 6, n = 50) {
  enriched.kmers <- findKmerEnrich(gr.seq = seqs, k = k, genomeTag = "mm10",
                                   kmer.background = kmer.background)[1:n]
  enriched.kmers <-
    data.table(kmer = names(enriched.kmers),
               enrich = enriched.kmers,
               miR = sapply(names(enriched.kmers),
                            associateKmerWithMiRNA, mirnas = mirnas))
  enriched.kmers
}

mm10.annot <- loadAnnot("mm10")
mm10.transcript <- transcriptsBy(mm10.annot$txdb)
mm10.transcript <- unlist(mm10.transcript)
mm10.withpeaks <- subsetByOverlaps(mm10.transcript, all.peaks)
mm10.withpeaks.seq <- get.seqs(bsgenome, mm10.withpeaks)

# 6mer
transcript.overlap.k6 <- findEnrichedKmersSeqs(mm10.withpeaks.seq, k6.background.overlap)
ggplot(transcript.overlap.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 6mers in all transcripts with peaks")

pdf("PDF_figure/Kmer_background_overlap_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(transcript.overlap.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 6mers in all transcripts with peaks")
dev.off()
## quartz_off_screen 
##                 2
# 7mer
transcript.overlap.k7 <- findEnrichedKmersSeqs(mm10.withpeaks.seq, k7.background.overlap, k=7)
ggplot(transcript.overlap.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 7mers in all transcripts with peaks")

pdf("PDF_figure/Kmer_background_overlap_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(transcript.overlap.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 7mers in all transcripts with peaks")
dev.off()
## quartz_off_screen 
##                 2
# 8mer
transcript.overlap.k8 <- findEnrichedKmersSeqs(mm10.withpeaks.seq, k8.background.overlap, k=8)
ggplot(transcript.overlap.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 8mers in all transcripts with peaks")

pdf("PDF_figure/Kmer_background_overlap_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(transcript.overlap.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Enriched 8mers in all transcripts with peaks")
dev.off()
## quartz_off_screen 
##                 2

HOMER Run on 7mer motifs

findKmerWindows <- function(kmers, peaks,
                            peaks.seq) {
    sapply(kmers,
           function(kmer) {
               positions <- findKmerPositions(gr.seq = peaks.seq, kmer = kmer)
               positions <- mapPositionsToGenome(positions, peaks)
               positions
           },
           simplify = FALSE)
}

prepareHOMERinput <- function(kmer.results, peaks, peaks.seq, filename.tag,
                              n.kmers = 50, width = 15) {
    kmer.positions <- findKmerWindows(kmer.results[1:n.kmers, kmer], peaks = peaks, peaks.seq = peaks.seq)
    kmer.positions <- unlist(GRangesList(kmer.positions))
    kmer.positions <- resize(kmer.positions, fix = "center", width = width)
    kmer.positions <- GenomicRanges::reduce(kmer.positions)
    kmer.positions.filename <-
        sprintf("Kmer/%s-peaks-k7-windows.bed", filename.tag)
    rtracklayer::export(kmer.positions, kmer.positions.filename)
    print("info on k-mer positions (count and width distribution):")
    print(length(kmer.positions))
    print(summary(width(kmer.positions)))
    
    kmer.background <-
        c(shift(kmer.positions, 100),
          shift(kmer.positions, 200),
          shift(kmer.positions, -100),
          shift(kmer.positions, -200))
    kmer.background <- GenomicRanges::reduce(kmer.background)
    kmer.background <- kmer.background[kmer.background %outside% peaks]
    print("info on background positions (count and width distribution):")
    print(length(kmer.background))
    print(summary(width(kmer.background)))
    kmer.background.filename <-
        sprintf("Kmer//%s-peaks-k7-windows-background.bed",
                filename.tag)
    rtracklayer::export(kmer.background, kmer.background.filename)
}
overlap.peaks.seq <- get.seqs(bsgenome, peaks.overlap)


## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = overlap.peaks.k7, 
                  peaks = peaks.overlap, 
                  peaks.seq = overlap.peaks.seq,
                  filename.tag = "overlap",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 575
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   17.71   18.00   53.00 
## [1] "info on background positions (count and width distribution):"
## [1] 2246
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   17.77   18.00   53.00
genomic.regions <- "Kmer/overlap-peaks-k7-windows.bed"
background.regions <- "Kmer/overlap-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-overlap-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)

print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/overlap-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-overlap-peaks-k7 -bg Kmer/overlap-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
print.miRNA.Motif.HOMER <- function(mirna.table, file, score = 6) {
    # write motif to file according to HOMER format
    tmp <- apply(mirna.table, 1, function(x) {
        mirna.id <- x["MiRBase.ID"]
        mirna.family <- x["miR.family"]
        mirna.seq <- x["seed"]
        mirna.name <- sprintf("%s (%s)", mirna.family, mirna.id)
        mirna.seq.df <- as.data.frame(strsplit(mirna.seq, "")[[1]])
        colnames(mirna.seq.df) <- "char"
        mirna.seq.df$A <- ifelse(mirna.seq.df$char == "A", 0.997, 0.001)
        mirna.seq.df$C <- ifelse(mirna.seq.df$char == "C", 0.997, 0.001)
        mirna.seq.df$G <- ifelse(mirna.seq.df$char == "G", 0.997, 0.001)
        mirna.seq.df$T <- ifelse(mirna.seq.df$char == "T", 0.997, 0.001)
        mirna.seq.mat <- as.matrix(mirna.seq.df[2:5])
        write(sprintf(">%s\t%s\t%s", mirna.seq, mirna.name, score), file,
              append = TRUE)
        write.table(format(mirna.seq.mat, digits = 4), file, sep = "\t",
                    quote = FALSE, row.names = FALSE, col.names = FALSE,
                    append = TRUE)
    })
}

Define functions that will do the parsing and plotting of the results.

plotHomerResults <- function(homer.table, homer.pwms, n.motifs = 20) {
    ncol <- 4
    laymat <- matrix(1:((1 + n.motifs) * ncol), ncol = ncol, byrow = FALSE)
    logos.list <- lapply(homer.pwms[1:n.motifs],
                         function(pwm) {
                             ggseqlogo(pwm) +
                                 theme(axis.text.x = element_blank(),
                                       axis.title.y = element_blank(),
                                       axis.line.y =
                                           element_line(color = "gray"),
                                       axis.ticks.y =
                                           element_line(color = "gray")) +
                                 ylim(0, 2)
                         })
    ranks.text <- sapply(homer.table$Rank[1:n.motifs], textGrob,
                         simplify = FALSE)
    pval.text <- sapply(sprintf("%.0f", -homer.table$log10.p)[1:n.motifs],
                        textGrob, simplify = FALSE)
    targ.text <- sapply(homer.table$freq.targets[1:n.motifs], textGrob,
                        simplify = FALSE)
    bg.text <- sapply(homer.table$freq.bg[1:n.motifs], textGrob,
                      simplify = FALSE)
    tf.text <- sapply(homer.table$best.match.simple[1:n.motifs], textGrob,
                      simplify = FALSE)
    headers <- sapply(c("rank", "motif", "-log10\np-value",
                        "freq.\ntargets", "freq.\nbackgr.",
                        "best match"),
                      textGrob,
                      simplify = FALSE)
    all.plots <- c(headers[1], ranks.text,
                   headers[2], logos.list,
                   headers[3], pval.text,
                   # headers[4], targ.text,
                   # headers[5], bg.text,
                   headers[6], tf.text)
    grid.arrange(grobs = all.plots, layout_matrix = laymat,
                 # widths = c(1, 4, 1, 1, 1, 3),
                 widths = c(1, 4, 1, 3),
                 ncol = ncol)
}

loadPWM <- function(filename) {
    motif <- fread(filename, skip = 1)
    if (nrow(motif) > 0) {
        pwm <- t(as.matrix(as.data.frame(motif)))
        rownames(pwm) <- c("A", "C", "G", "U")
        pwm
    } else {
        NULL
    }
}

loadHomerResults <- function(dirname) {
    # also allow version="extended" for motifs without stringent
    #   similarity filtering
    homer.table <-
        html_nodes(read_html(sprintf("%s/homerResults.html", dirname)), "table")
    homer.table <- html_table(homer.table, header = TRUE)[[1]]
    homer.table <- data.table(homer.table, check.names = TRUE)
    homer.table <- homer.table[, .(Rank,
                                   log10.p = log.P.pvalue / log(10),
                                   freq.targets = X..of.Targets,
                                   freq.bg = X..of.Background,
                                   best.match = Best.Match.Details)]
    homer.table[, filename := sprintf("%s/homerResults/motif%s.motif",
                                      dirname, seq_along(Rank))]
    homer.pwms <- sapply(homer.table$filename, loadPWM, simplify = FALSE)
    list(homer.table, homer.pwms)
}
overlap.peaks.homer.res <- loadHomerResults("Kmer/homer-denovo-output-overlap-peaks-k7")
human.mirnas <-
    sapply(strsplit(overlap.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
##  [1] "hsa-miR-4500" "hsa-miR-194"  "hsa-miR-29c"  "hsa-miR-200b" "hsa-miR-30a" 
##  [6] "hsa-miR-22"   "hsa-miR-24"   "hsa-miR-210"  "hsa-miR-19a"  "hsa-miR-1203"
## [11] "hsa-miR-4258"
mouse.mirnas <- c("let-7/miR-98", "miR-194", "miR-29", "miR-200", "miR-30", "miR-22", "miR-24", "miR-210", "miR-19a", "miR-1203", "hsa-miR-4258")
overlap.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(overlap.peaks.homer.res[[1]],
                 overlap.peaks.homer.res[[2]], n.motifs = 8)

pdf("PDF_figure/Homer_overlap_peaks.pdf",
    width = 6,
    height = 10)
plotHomerResults(overlap.peaks.homer.res[[1]],
                 overlap.peaks.homer.res[[2]], n.motifs = 8)
dev.off()
## quartz_off_screen 
##                 2

HF peaks

All HF peaks

Now we look at that are the enriched kmers in the all HF peaks.

# calculate all the kmer backgrounds
k6.background.hf <- calculateKmerBackground(k=6, genomeTag = "mm10", gr.overlap = peaks.all.hf,
                                         exons.only = TRUE)
k7.background.hf <- calculateKmerBackground(k=7, genomeTag = "mm10", gr.overlap = peaks.all.hf,
                                         exons.only = TRUE)
k8.background.hf <- calculateKmerBackground(k=8, genomeTag = "mm10", gr.overlap = peaks.all.hf,
                                         exons.only = TRUE)

# 6mer
hf.peaks.k6 <- findEnrichedKmersPeaks(hf.peaks, k6.background.hf, k=6)
ggplot(hf.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HF peaks")

pdf("PDF_figure/Kmer_all_HF_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hf.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HF peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.peaks.k6[1:10,]
##       kmer   enrich
##  1: GAGAGA 4.514729
##  2: AGAGAG 4.417459
##  3: GGGGGG 2.952205
##  4: TACCTC 2.776311
##  5: CTACCT 2.545049
##  6: TGTTAC 2.274778
##  7: TAGTAC 2.252498
##  8: CGCACA 2.250982
##  9: ACCTCA 2.192602
## 10: GAGAGG 2.121025
##                                                                                                                                                        miR
##  1:                                                                                                          miR-3473c,miR-6896-3p,miR-6975-3p,miR-7649-3p
##  2:                                                                                                                      miR-1896,miR-6973b-3p,miR-7063-3p
##  3:                                                                                                                                            miR-7082-3p
##  4:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  5: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  6:                                                                                                                                             miR-194-5p
##  7:                                                                                                                                                       
##  8:                                                                                                                                  miR-147-3p,miR-210-3p
##  9:                          let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-672-5p,miR-7674-5p,miR-98-5p
## 10:                                                                                                                                            miR-7032-3p
# 7mers
hf.peaks.k7 <- findEnrichedKmersPeaks(hf.peaks, k7.background.hf, k=7)
ggplot(hf.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HF peaks")

pdf("PDF_figure/Kmer_all_HF_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hf.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HF peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.peaks.k7[1:10,]
##        kmer   enrich
##  1: GAGAGAG 5.173576
##  2: AGAGAGA 5.064393
##  3: GGGGGGG 3.988259
##  4: CTACCTC 3.325840
##  5: TTAGTAC 3.205786
##  6: TACCTCA 3.188136
##  7: GTTAGTA 3.178083
##  8: GGTTAGT 2.947037
##  9: CGCACAA 2.936837
## 10: TAGTACT 2.924370
##                                                                                                           miR
##  1:                                                                                                          
##  2:                                                                                                          
##  3:                                                                                                          
##  4: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  5:                                                                                                          
##  6:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  7:                                                                                                          
##  8:                                                                                                          
##  9:                                                                                                          
## 10:
# 8mers
hf.peaks.k8 <- findEnrichedKmersPeaks(hf.peaks, k8.background.hf, k=8)
ggplot(hf.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HF peaks")

pdf("PDF_figure/Kmer_all_HF_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hf.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HF peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.peaks.k8[1:10,]
##         kmer   enrich
##  1: AGAGAGAG 5.462941
##  2: GAGAGAGA 5.452874
##  3: GGGGGGGG 4.475990
##  4: GAGAGAGG 3.905039
##  5: GTTAGTAC 3.594828
##  6: GGTTAGTA 3.549057
##  7: CTACCTCA 3.484796
##  8: TTAGTACT 3.468111
##  9: TAGTACTT 3.418828
## 10: GAGAGGGA 3.358441
##                                                                                                 miR
##  1:                                                                                                
##  2:                                                                                                
##  3:                                                                                                
##  4:                                                                                                
##  5:                                                                                                
##  6:                                                                                                
##  7: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  8:                                                                                                
##  9:                                                                                                
## 10:

HOMER run

all.hf.peaks.seq <- get.seqs(bsgenome, peaks.all.hf)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hf.peaks.k7, 
                  peaks = peaks.all.hf, 
                  peaks.seq = all.hf.peaks.seq,
                  filename.tag = "all.hf",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 731
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   22.07   19.00   84.00 
## [1] "info on background positions (count and width distribution):"
## [1] 2861
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   22.23   20.00   84.00
genomic.regions <- "Kmer/all.hf-peaks-k7-windows.bed"
background.regions <- "Kmer/all.hf-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-all.hf-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/all.hf-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-all.hf-peaks-k7 -bg Kmer/all.hf-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
all.hf.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(all.hf.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
##  [1] "hsa-miR-4500"    "hsa-miR-1278"    "hsa-miR-802"     "hsa-miR-516a-3p"
##  [5] "hsa-miR-27b*"    "hsa-miR-4524"    "hsa-miR-147b"    "hsa-miR-383"    
##  [9] "hsa-miR-1297"    "hsa-miR-3183"    "hsa-miR-4454"    "hsa-miR-4707-3p"
## [13] "hsa-miR-629*"
mouse.mirnas <- c("let-7/miR-98", "miR-1278", "miR-194", "miR-516", "miR-27", "miR-29", "miR-210", "miR-383", "miR-26/1297/4465", "miR-3183/4723/6769", "miR-4454", "hsa-miR-4707-3p", "hsa-miR-629*")
all.hf.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(all.hf.peaks.homer.res[[1]],
                 all.hf.peaks.homer.res[[2]], n.motifs = 8)

pdf("PDF_figure/Homer_all_HF_peaks.pdf",
    width = 6,
    height = 10)
plotHomerResults(all.hf.peaks.homer.res[[1]],
                 all.hf.peaks.homer.res[[2]], n.motifs = 8)
dev.off()
## quartz_off_screen 
##                 2
All HF-specific peaks
# 6mer
hf.specific.peaks.k6 <- findEnrichedKmersPeaks(hf.specific.peaks, k6.background.hf, k=6)
ggplot(hf.specific.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HF-specific peaks")

pdf("PDF_figure/Kmer_HF_specific_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hf.specific.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.specific.peaks.k6[1:10,]
##       kmer   enrich
##  1: GAGAGA 5.910604
##  2: AGAGAG 5.815523
##  3: GAGAGG 3.265633
##  4: TAGTAC 3.205425
##  5: GAGGGA 3.143947
##  6: GGTTAG 3.094367
##  7: GTTAGT 2.951913
##  8: GGAGAG 2.898943
##  9: AGGGAG 2.866323
## 10: GGGAGA 2.781981
##                                                                 miR
##  1:                   miR-3473c,miR-6896-3p,miR-6975-3p,miR-7649-3p
##  2:                               miR-1896,miR-6973b-3p,miR-7063-3p
##  3:                                                     miR-7032-3p
##  4:                                                                
##  5:                                                                
##  6:                                                     miR-7218-3p
##  7:                                                     miR-7062-3p
##  8:                   miR-1894-5p,miR-3473c,miR-6975-3p,miR-7649-3p
##  9:                                             miR-343,miR-6976-3p
## 10: miR-150-5p,miR-1894-5p,miR-343,miR-5127,miR-6976-3p,miR-7045-3p
# 7mers
hf.specific.peaks.k7 <- findEnrichedKmersPeaks(hf.specific.peaks, k7.background.hf, k=7)
ggplot(hf.specific.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HF-specific peaks")

pdf("PDF_figure/Kmer_HF_specific_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hf.specific.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.specific.peaks.k7[1:10,]
##        kmer   enrich                               miR
##  1: GAGAGAG 6.546272                                  
##  2: AGAGAGA 6.437620                                  
##  3: AGAGAGG 4.052991                                  
##  4: GTTAGTA 3.881307                                  
##  5: TTAGTAC 3.863916                                  
##  6: GGTTAGT 3.788374                                  
##  7: TAGTACT 3.754561                                  
##  8: GAGAGGG 3.702571                                  
##  9: TGGTTAG 3.701170                                  
## 10: GGAGAGA 3.675163 miR-3473c,miR-6975-3p,miR-7649-3p
# 8mers
hf.specific.peaks.k8 <- findEnrichedKmersPeaks(hf.specific.peaks, k8.background.hf, k=8)
ggplot(hf.specific.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HF-specific peaks")

pdf("PDF_figure/Kmer_HF_specific_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hf.specific.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.specific.peaks.k8[1:10,]
##         kmer   enrich miR
##  1: AGAGAGAG 6.809019    
##  2: GAGAGAGA 6.799836    
##  3: GAGAGAGG 4.844114    
##  4: GGAGAGAG 4.364336    
##  5: GGTTAGTA 4.183465    
##  6: GAGAGGGA 4.142659    
##  7: AGAGAGGG 4.117766    
##  8: GTTAGTAC 4.080454    
##  9: TTAGTACT 4.060624    
## 10: TAGTACTT 4.055144

HOMER run

hf.specific.peaks.seq <- get.seqs(bsgenome, hf.specific.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hf.specific.peaks.k7, 
                  peaks = hf.specific.peaks, 
                  peaks.seq = hf.specific.peaks.seq,
                  filename.tag = "hf.specific",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 224
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    15.0    16.0    32.5    35.3    54.0    86.0 
## [1] "info on background positions (count and width distribution):"
## [1] 889
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   16.00   33.00   35.44   54.00   86.00
genomic.regions <- "Kmer/hf.specific-peaks-k7-windows.bed"
background.regions <- "Kmer/hf.specific-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-hf.specific-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/hf.specific-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-hf.specific-peaks-k7 -bg Kmer/hf.specific-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
hf.specific.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(hf.specific.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
## [1] "hsa-miR-4667-3p" "hsa-miR-3691-3p" "hsa-miR-27b*"    "hsa-miR-629*"   
## [5] "hsa-miR-3183"    "hsa-miR-4640-3p" "hsa-miR-4753-3p"
mouse.mirnas <- c("miR-4667-3p", "miR-26/1297/4465", "miR-21", "miR-629", "miR-3183/4723/6769", "miR-4640-3p", "hsa-miR-4753-3p")
hf.specific.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(hf.specific.peaks.homer.res[[1]],
                 hf.specific.peaks.homer.res[[2]], n.motifs = 7)

pdf("PDF_figure/Homer_HF_specific_peaks.pdf",
    width = 6,
    height = 8)
plotHomerResults(hf.specific.peaks.homer.res[[1]],
                 hf.specific.peaks.homer.res[[2]], n.motifs = 7)
dev.off()
## quartz_off_screen 
##                 2
Hit HF-specific peaks
# obtain GRanges subjects for hit HF peaks
hit.hf.index <- c()
for (i in 1:length(hit_hf_peak_list$name)) {
  hit.hf.index <- c(hit.hf.index, grep(paste("^", hit_hf_peak_list$name[i], "$", sep =""), hf.specific.peaks$name))
}

hf.hit.peaks <- hf.specific.peaks[hit.hf.index]

# 6mer
hf.hit.peaks.k6 <- findEnrichedKmersPeaks(hf.hit.peaks, k6.background.hf, k=6)
ggplot(hf.hit.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in hit HF-specific peaks")

pdf("PDF_figure/Kmer_hit_HF_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hf.hit.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in hit HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.hit.peaks.k6[1:10,]
##       kmer   enrich
##  1: AGGAGG 3.276685
##  2: GAGGGA 3.172056
##  3: GAGGAG 2.842822
##  4: AGAGAG 2.837563
##  5: GAGAGA 2.767335
##  6: GAGAGG 2.632297
##  7: GGAGGG 2.611487
##  8: GGAGGA 2.606314
##  9: AGGGAA 2.511339
## 10: AGAGGA 2.454626
##                                                                        miR
##  1:                                                                       
##  2:                                                                       
##  3:                                                            miR-6919-3p
##  4:                                      miR-1896,miR-6973b-3p,miR-7063-3p
##  5:                          miR-3473c,miR-6896-3p,miR-6975-3p,miR-7649-3p
##  6:                                                            miR-7032-3p
##  7:                                                                       
##  8:                                                            miR-6919-3p
##  9:                                      miR-204-5p,miR-211-5p,miR-7670-3p
## 10: miR-3059-5p,miR-5616-5p,miR-6961-3p,miR-7032-3p,miR-7650-5p,miR-877-3p
# 7mers
hf.hit.peaks.k7 <- findEnrichedKmersPeaks(hf.hit.peaks, k7.background.hf, k=7)
ggplot(hf.hit.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in hit HF-specific peaks")

pdf("PDF_figure/Kmer_hit_HF_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hf.hit.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in hit HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.hit.peaks.k7[1:10,]
##        kmer   enrich miR
##  1: GAGGAGG 3.308519    
##  2: GGAGGGA 3.224847    
##  3: GAGAGAG 3.059716    
##  4: AGGAGGG 2.879872    
##  5: AGGAGGA 2.879394    
##  6: AGAGGAG 2.873821    
##  7: GAGGGAA 2.834131    
##  8: AGAGAGA 2.672140    
##  9: GAGGGAG 2.641521    
## 10: AAGGAGG 2.616016
# 8mers
hf.hit.peaks.k8 <- findEnrichedKmersPeaks(hf.hit.peaks, k8.background.hf, k=8)
ggplot(hf.hit.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in hit HF-specific peaks")

pdf("PDF_figure/Kmer_hit_HF_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hf.hit.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in hit HF-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hf.hit.peaks.k8[1:10,]
##         kmer   enrich miR
##  1: AGAGGAGG 3.082909    
##  2: GAGGAGGA 3.021367    
##  3: AGGAGGGA 3.014233    
##  4: AGAGAGAG 2.798164    
##  5: GGAGGGAA 2.652681    
##  6: GAAGGAGG 2.628484    
##  7: GAGAGAGG 2.613564    
##  8: AAGAGGAG 2.584180    
##  9: GGAGGGAG 2.541196    
## 10: AAGGAGGG 2.446382

HOMER run

hf.hit.peaks.seq <- get.seqs(bsgenome, hf.hit.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hf.hit.peaks.k7, 
                  peaks = hf.hit.peaks, 
                  peaks.seq = hf.hit.peaks.seq,
                  filename.tag = "hf.hit",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 46
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   17.00   25.57   30.00   61.00 
## [1] "info on background positions (count and width distribution):"
## [1] 179
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   17.00   25.78   30.00   61.00
genomic.regions <- "Kmer/hf.hit-peaks-k7-windows.bed"
background.regions <- "Kmer/hf.hit-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-hf.hit-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/hf.hit-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-hf.hit-peaks-k7 -bg Kmer/hf.hit-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
hf.hit.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(hf.hit.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
## [1] "hsa-miR-4768-5p" "hsa-miR-4448"    "hsa-miR-4495"    "hsa-miR-877*"   
## [5] "hsa-miR-761"
mouse.mirnas <- c("miR-4768/6833-3p", "miR-4448", "miR-4495", "miR-877", "miR-195")
hf.hit.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(hf.hit.peaks.homer.res[[1]],
                 hf.hit.peaks.homer.res[[2]], n.motifs = 5)

pdf("PDF_figure/Homer_hit_HF_peaks.pdf",
    width = 6,
    height = 6)
plotHomerResults(hf.hit.peaks.homer.res[[1]],
                 hf.hit.peaks.homer.res[[2]], n.motifs = 5)
dev.off()
## quartz_off_screen 
##                 2
Background kmer enrichment in transcripts with HF peaks

As a sanity check, we need to compare the above enrichment with enrichment of k-mers in whole transcript sequences (because the composition of peak position is ~50% 3'UTR, ~30% exon and ~20% intron).

mm10.withpeaks.hf <- subsetByOverlaps(mm10.transcript, peaks.all.hf)
mm10.withpeaks.hf.seq <- get.seqs(bsgenome, mm10.withpeaks.hf)

# 6mer
transcript.hf.k6 <- findEnrichedKmersSeqs(mm10.withpeaks.hf.seq, k6.background.hf)
ggplot(transcript.hf.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 6mers in all transcripts with HF peaks")

pdf("PDF_figure/Kmer_background_HF_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hf.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 6mers in all transcripts with HF peaks")
dev.off()
## quartz_off_screen 
##                 2
# 7mer
transcript.hf.k7 <- findEnrichedKmersSeqs(mm10.withpeaks.hf.seq, k7.background.hf, k=7)
ggplot(transcript.hf.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 7mers in all transcripts with HF peaks")

pdf("PDF_figure/Kmer_background_HF_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hf.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 7mers in all transcripts with HF peaks")
dev.off()
## quartz_off_screen 
##                 2
# 8mer
transcript.hf.k8 <- findEnrichedKmersSeqs(mm10.withpeaks.hf.seq, k8.background.hf, k=8)
ggplot(transcript.hf.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 8mers in all transcripts with HF peaks")

pdf("PDF_figure/Kmer_background_HF_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hf.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 8mers in all transcripts with HF peaks")
dev.off()
## quartz_off_screen 
##                 2

HFK peaks

All HFK peaks

Now we look at that are the enriched kmers in the all HFK peaks.

# calculate all the kmer backgrounds
k6.background.hfk <- calculateKmerBackground(k=6, genomeTag = "mm10", gr.overlap = peaks.all.hfk,
                                         exons.only = TRUE)
k7.background.hfk <- calculateKmerBackground(k=7, genomeTag = "mm10", gr.overlap = peaks.all.hfk,
                                         exons.only = TRUE)
k8.background.hfk <- calculateKmerBackground(k=8, genomeTag = "mm10", gr.overlap = peaks.all.hfk,
                                         exons.only = TRUE)

# 6mer
hfk.peaks.k6 <- findEnrichedKmersPeaks(hfk.peaks, k6.background.hfk, k=6)
ggplot(hfk.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HFK peaks")

pdf("PDF_figure/Kmer_all_HFK_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hfk.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HFK peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.peaks.k6[1:10,]
##       kmer   enrich
##  1: TACCTC 2.622092
##  2: CTACCT 2.422006
##  3: TGTTAC 2.347290
##  4: CAGTAT 2.016787
##  5: CGCACA 1.989222
##  6: AGTATT 1.932105
##  7: TCTACC 1.887613
##  8: GTATTA 1.856389
##  9: GTTACA 1.825499
## 10: ACGCAC 1.817874
##                                                                                                                                                        miR
##  1:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  2: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  3:                                                                                                                                             miR-194-5p
##  4:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
##  5:                                                                                                                                  miR-147-3p,miR-210-3p
##  6:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
##  7:                                                                                                         miR-1193-5p,miR-1839-5p,miR-376a-5p,miR-379-5p
##  8:                                                                                              miR-200b-3p,miR-200c-3p,miR-369-3p,miR-374c-5p,miR-429-3p
##  9:                                                                                                                       miR-194-5p,miR-379-3p,miR-411-3p
## 10:                                                                                                                                             miR-210-3p
# 7mers
hfk.peaks.k7 <- findEnrichedKmersPeaks(hfk.peaks, k7.background.hfk, k=7)
ggplot(hfk.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HFK peaks")

pdf("PDF_figure/Kmer_all_HFK_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hfk.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HFK peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.peaks.k7[1:10,]
##        kmer   enrich
##  1: CTACCTC 3.325645
##  2: CTGTTAC 3.073933
##  3: TACCTCA 2.964520
##  4: CGCACAA 2.962052
##  5: AGTATTA 2.854151
##  6: TCTACCT 2.759571
##  7: CAGTATT 2.734688
##  8: GGTGCTA 2.686393
##  9: ACTACCT 2.552687
## 10: ATCTACC 2.545345
##                                                                                                           miR
##  1: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  2:                                                                                                miR-194-5p
##  3:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  4:                                                                                                          
##  5:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  6:                                                                                               miR-1839-5p
##  7:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  8:                                                                          miR-29a-3p,miR-29b-3p,miR-29c-3p
##  9:                                                                          miR-196a-5p,miR-196b-5p,miR-3962
## 10:                                                                                               miR-376a-5p
# 8mers
hfk.peaks.k8 <- findEnrichedKmersPeaks(hfk.peaks, k8.background.hfk, k=8)
ggplot(hfk.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HFK peaks")

pdf("PDF_figure/Kmer_all_HFK_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hfk.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HFK peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.peaks.k8[1:10,]
##         kmer   enrich
##  1: TCTACCTC 3.604939
##  2: CTACCTCA 3.587241
##  3: CAGTATTA 3.397272
##  4: TCTGTTAC 3.340069
##  5: CTGTTACA 3.323493
##  6: ACTACCTC 3.253249
##  7: ATCTACCT 3.168790
##  8: GCTGTTAC 3.161519
##  9: TACCTCAG 3.143018
## 10: TGGTGCTA 3.128837
##                                                                                                 miR
##  1:                                                                                                
##  2: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  3:                                                              miR-200b-3p,miR-200c-3p,miR-429-3p
##  4:                                                                                                
##  5:                                                                                      miR-194-5p
##  6:                                                                                                
##  7:                                                                                                
##  8:                                                                                                
##  9:                                                                                                
## 10:                                                                miR-29a-3p,miR-29b-3p,miR-29c-3p

HOMER run

all.hfk.peaks.seq <- get.seqs(bsgenome, hfk.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hfk.peaks.k7, 
                  peaks = hfk.peaks, 
                  peaks.seq = all.hfk.peaks.seq,
                  filename.tag = "all.hfk",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 2783
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   16.81   17.00   54.00 
## [1] "info on background positions (count and width distribution):"
## [1] 10269
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   16.92   17.00   54.00
genomic.regions <- "Kmer/all.hfk-peaks-k7-windows.bed"
background.regions <- "Kmer/all.hfk-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-all.hfk-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/all.hfk-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-all.hfk-peaks-k7 -bg Kmer/all.hfk-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
all.hfk.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(all.hfk.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
##  [1] "hsa-miR-4500"    "hsa-miR-195"     "hsa-miR-200b"    "hsa-miR-24"     
##  [5] "hsa-miR-106b"    "hsa-miR-194"     "hsa-miR-1297"    "hsa-miR-147b"   
##  [9] "hsa-miR-32"      "hsa-miR-22"      "hsa-miR-4772-3p" "hsa-miR-4707-3p"
mouse.mirnas <- c("let-7/miR-98", "miR-29", "miR-200/429", "miR-24", "miR-17/20/93/106/519/526", "miR-194", "miR-26/1297/4465", "miR-147b", "miR-25-3p/32/92-3p/363-3p/367-3p", "miR-22", "hsa-miR-4772-3p", "hsa-miR-4707-3p")
all.hfk.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(all.hfk.peaks.homer.res[[1]],
                 all.hfk.peaks.homer.res[[2]], n.motifs = 8)

pdf("PDF_figure/Homer_all_HFK_peaks.pdf",
    width = 6,
    height = 10)
plotHomerResults(all.hfk.peaks.homer.res[[1]],
                 all.hfk.peaks.homer.res[[2]], n.motifs = 8)
dev.off()
## quartz_off_screen 
##                 2
All HFK-specific peaks
# 6mer
hfk.specific.peaks.k6 <- findEnrichedKmersPeaks(hfk.specific.peaks, k6.background.hfk, k=6)
ggplot(hfk.specific.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HFK-specific peaks")

pdf("PDF_figure/Kmer_HFK_specific_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hfk.specific.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in all HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.specific.peaks.k6[1:10,]
##       kmer   enrich
##  1: TACCTC 2.429711
##  2: TGTTAC 2.315375
##  3: CTACCT 2.271077
##  4: CAGTAT 2.002503
##  5: AGTATT 1.927065
##  6: GTATTA 1.864020
##  7: CGCACA 1.857613
##  8: CTGTTA 1.846135
##  9: GTTACA 1.758898
## 10: TCTACC 1.743318
##                                                                                                                                                        miR
##  1:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  2:                                                                                                                                             miR-194-5p
##  3: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  4:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
##  5:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
##  6:                                                                                              miR-200b-3p,miR-200c-3p,miR-369-3p,miR-374c-5p,miR-429-3p
##  7:                                                                                                                                  miR-147-3p,miR-210-3p
##  8:                                                                                                           miR-132-3p,miR-194-5p,miR-212-3p,miR-6997-5p
##  9:                                                                                                                       miR-194-5p,miR-379-3p,miR-411-3p
## 10:                                                                                                         miR-1193-5p,miR-1839-5p,miR-376a-5p,miR-379-5p
# 7mers
hfk.specific.peaks.k7 <- findEnrichedKmersPeaks(hfk.specific.peaks, k7.background.hfk, k=7)
ggplot(hfk.specific.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HFK-specific peaks")

pdf("PDF_figure/Kmer_HFK_specific_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hfk.specific.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in all HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.specific.peaks.k7[1:10,]
##        kmer   enrich
##  1: CTACCTC 3.135311
##  2: CTGTTAC 3.097413
##  3: AGTATTA 2.825961
##  4: TACCTCA 2.727877
##  5: CAGTATT 2.726627
##  6: CGCACAA 2.673223
##  7: GGTGCTA 2.664634
##  8: TCTACCT 2.577624
##  9: TGTTACA 2.496617
## 10: ATCTACC 2.477707
##                                                                                                           miR
##  1: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  2:                                                                                                miR-194-5p
##  3:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  4:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  5:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  6:                                                                                                          
##  7:                                                                          miR-29a-3p,miR-29b-3p,miR-29c-3p
##  8:                                                                                               miR-1839-5p
##  9:                                                                                                miR-194-5p
## 10:                                                                                               miR-376a-5p
# 8mers
hfk.specific.peaks.k8 <- findEnrichedKmersPeaks(hfk.specific.peaks, k8.background.hfk, k=8)
ggplot(hfk.specific.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HFK-specific peaks")

pdf("PDF_figure/Kmer_HFK_specific_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hfk.specific.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in all HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.specific.peaks.k8[1:10,]
##         kmer   enrich
##  1: TCTGTTAC 3.394824
##  2: TCTACCTC 3.391763
##  3: CTACCTCA 3.387699
##  4: CAGTATTA 3.324135
##  5: CTGTTACA 3.294427
##  6: GCTGTTAC 3.245336
##  7: ACTACCTC 3.119942
##  8: TGGTGCTA 3.100228
##  9: ATCTACCT 3.040603
## 10: GCAGTATT 3.013702
##                                                                                                 miR
##  1:                                                                                                
##  2:                                                                                                
##  3: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  4:                                                              miR-200b-3p,miR-200c-3p,miR-429-3p
##  5:                                                                                      miR-194-5p
##  6:                                                                                                
##  7:                                                                                                
##  8:                                                                miR-29a-3p,miR-29b-3p,miR-29c-3p
##  9:                                                                                                
## 10:

HOMER run

hfk.specific.peaks.seq <- get.seqs(bsgenome, hfk.specific.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hfk.specific.peaks.k7, 
                  peaks = hfk.specific.peaks, 
                  peaks.seq = hfk.specific.peaks.seq,
                  filename.tag = "hfk.specific",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 2067
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   16.78   17.00   54.00 
## [1] "info on background positions (count and width distribution):"
## [1] 7766
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   16.87   17.00   54.00
genomic.regions <- "Kmer/hfk.specific-peaks-k7-windows.bed"
background.regions <- "Kmer/hfk.specific-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-hfk.specific-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/hfk.specific-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-hfk.specific-peaks-k7 -bg Kmer/hfk.specific-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
hfk.specific.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(hfk.specific.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
## [1] "hsa-miR-4500" "hsa-miR-29c"  "hsa-miR-200b" "hsa-miR-379*" "hsa-miR-106b"
## [6] "hsa-miR-191"  "hsa-miR-1297" "hsa-miR-22"   "hsa-miR-32"
mouse.mirnas <- c("let-7/miR-98", "miR-29", "miR-200/429", "miR-194", "miR-17/20/93/106/519/526", "miR-191", "miR-26/1297/4465", "miR-22", "miR-25-3p/32/92-3p/363-3p/367-3p")
hfk.specific.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(hfk.specific.peaks.homer.res[[1]],
                 hfk.specific.peaks.homer.res[[2]], n.motifs = 8)

pdf("PDF_figure/Homer_HFK_specific_peaks.pdf",
    width = 6,
    height = 10)
plotHomerResults(hfk.specific.peaks.homer.res[[1]],
                 hfk.specific.peaks.homer.res[[2]], n.motifs = 8)
dev.off()
## quartz_off_screen 
##                 2
Hit HFK-specific peaks
# obtain GRanges subjects for hit HF peaks
hit.hfk.index <- c()
for (i in 1:length(hit_hfk_peak_list$name)) {
  hit.hfk.index <- c(hit.hfk.index, grep(paste("^", hit_hfk_peak_list$name[i], "$", sep =""), hfk.specific.peaks$name))
}

hfk.hit.peaks <- hfk.specific.peaks[hit.hfk.index]

# 6mer
hfk.hit.peaks.k6 <- findEnrichedKmersPeaks(hfk.hit.peaks, k6.background.hfk, k=6)
ggplot(hfk.hit.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in hit HFK-specific peaks")

pdf("PDF_figure/Kmer_hit_HFK_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hfk.hit.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in hit HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.hit.peaks.k6[1:10,]
##       kmer   enrich
##  1: TACCTC 2.387491
##  2: ACCTCA 2.196542
##  3: CTACCT 2.142770
##  4: GTATTA 2.104202
##  5: CAGTAT 2.027791
##  6: TAACGC 1.980771
##  7: TATCCT 1.927117
##  8: GTGCTA 1.919341
##  9: TGTTAC 1.806391
## 10: AATGCT 1.713355
##                                                                                                                                                        miR
##  1:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  2:                          let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-672-5p,miR-7674-5p,miR-98-5p
##  3: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  4:                                                                                              miR-200b-3p,miR-200c-3p,miR-369-3p,miR-374c-5p,miR-429-3p
##  5:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
##  6:                                                                                                                                                       
##  7:                                                                                                                                                       
##  8:                                                                                                     miR-29a-3p,miR-29b-3p,miR-29c-3p,miR-6345,miR-6357
##  9:                                                                                                                                             miR-194-5p
## 10:                                                                                                                                            miR-1955-3p
# 7mers
hfk.hit.peaks.k7 <- findEnrichedKmersPeaks(hfk.hit.peaks, k7.background.hfk, k=7)
ggplot(hfk.hit.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in hit HFK-specific peaks")

pdf("PDF_figure/Kmer_hit_HFK_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hfk.hit.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in hit HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.hit.peaks.k7[1:10,]
##        kmer   enrich
##  1: CTACCTC 2.882893
##  2: TACCTCA 2.875163
##  3: GGTGCTA 2.340570
##  4: AGTATTA 2.261557
##  5: TGCCTGC 2.246395
##  6: TCAGTAT 2.243950
##  7: CAGTATT 2.183994
##  8: CTGTTAC 2.160833
##  9: ATGCTGC 2.149861
## 10: GTATTAT 2.141404
##                                                                                                           miR
##  1: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  2:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  3:                                                                          miR-29a-3p,miR-29b-3p,miR-29c-3p
##  4:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  5:                                                                                                          
##  6:                                                                                                          
##  7:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  8:                                                                                                miR-194-5p
##  9:                                                                                     miR-103-3p,miR-107-3p
## 10:                                                                                    miR-369-3p,miR-374c-5p
# 8mers
hfk.hit.peaks.k8 <- findEnrichedKmersPeaks(hfk.hit.peaks, k8.background.hfk, k=8)
ggplot(hfk.hit.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in hit HFK-specific peaks")

pdf("PDF_figure/Kmer_hit_HFK_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hfk.hit.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in hit HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.hit.peaks.k8[1:10,]
##         kmer   enrich
##  1: CTACCTCA 3.058864
##  2: TACCTCAG 2.484778
##  3: GCCCCTGG 2.414468
##  4: TACCTCAT 2.212203
##  5: TGGTGCTA 2.209404
##  6: CTGAGCCA 2.161589
##  7: ATGGTGCT 2.159933
##  8: TGTTACAG 2.149165
##  9: CCTCTGAG 2.096503
## 10: CTGTGCCA 2.088767
##                                                                                                 miR
##  1: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  2:                                                                                                
##  3:                                                                                                
##  4:                                                                                                
##  5:                                                                miR-29a-3p,miR-29b-3p,miR-29c-3p
##  6:                                         miR-24-3p,miR-5124b,miR-6361,miR-6369,miR-6410,miR-6413
##  7:                                                                                                
##  8:                                                                                                
##  9:                                                                                                
## 10:

HOMER run

hfk.hit.peaks.seq <- get.seqs(bsgenome, hfk.hit.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hfk.hit.peaks.k7, 
                  peaks = hfk.hit.peaks, 
                  peaks.seq = hfk.hit.peaks.seq,
                  filename.tag = "hfk.hit",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 159
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   15.00   18.23   18.00   54.00 
## [1] "info on background positions (count and width distribution):"
## [1] 600
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    15.0    15.0    16.0    18.5    18.0    54.0
genomic.regions <- "Kmer/hfk.hit-peaks-k7-windows.bed"
background.regions <- "Kmer/hfk.hit-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-hfk.hit-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/hfk.hit-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-hfk.hit-peaks-k7 -bg Kmer/hfk.hit-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
hfk.hit.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(hfk.hit.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
## [1] "hsa-miR-202"     "hsa-miR-195"     "hsa-miR-183"     "hsa-miR-200b"   
## [5] "hsa-miR-486-5p"  "hsa-miR-520a-5p" "hsa-miR-214"
mouse.mirnas <- c("let-7/miR-98", "miR-29", "miR-183", "miR-200/429", "miR-486", "miR-520/525", "miR-214-3p/761/3619")
hfk.hit.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(hfk.hit.peaks.homer.res[[1]],
                 hfk.hit.peaks.homer.res[[2]], n.motifs = 7)

pdf("PDF_figure/Homer_hit_HFK_peaks.pdf",
    width = 6,
    height = 8)
plotHomerResults(hfk.hit.peaks.homer.res[[1]],
                 hfk.hit.peaks.homer.res[[2]], n.motifs = 7)
dev.off()
## quartz_off_screen 
##                 2
Weird HFK-specific peaks
# obtain GRanges subjects for hit HF peaks
weird.hfk.index <- c()
for (i in 1:length(weird_hfk_peak_list$name)) {
  weird.hfk.index <- c(weird.hfk.index, grep(paste("^", weird_hfk_peak_list$name[i], "$", sep =""), hfk.specific.peaks$name))
}

hfk.weird.peaks <- hfk.specific.peaks[weird.hfk.index]

# 6mer
hfk.weird.peaks.k6 <- findEnrichedKmersPeaks(hfk.weird.peaks, k6.background.hfk, k=6)
ggplot(hfk.weird.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in weird HFK-specific peaks")

pdf("PDF_figure/Kmer_weird_HFK_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(hfk.weird.peaks.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 6mers in weird HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.weird.peaks.k6[1:10,]
##       kmer   enrich
##  1: TACCTC 2.261597
##  2: TGTTAC 2.133140
##  3: CTACCT 2.096110
##  4: GTTACC 1.910602
##  5: GTATTA 1.889515
##  6: GTTACA 1.873526
##  7: CCTACC 1.829442
##  8: ACCTAC 1.812755
##  9: AGTATT 1.799015
## 10: CGCACA 1.790724
##                                                                                                                                                        miR
##  1:                            let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-202-3p,miR-98-5p
##  2:                                                                                                                                             miR-194-5p
##  3: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1839-5p,miR-1961,miR-196a-5p,miR-196b-5p,miR-3962,miR-98-5p
##  4:                                                                                                                                                       
##  5:                                                                                              miR-200b-3p,miR-200c-3p,miR-369-3p,miR-374c-5p,miR-429-3p
##  6:                                                                                                                       miR-194-5p,miR-379-3p,miR-411-3p
##  7:                                                                                                                                            miR-7224-5p
##  8:                                                                                                                               miR-6932-5p,miR-6973b-5p
##  9:                                                                                                                     miR-200b-3p,miR-200c-3p,miR-429-3p
## 10:                                                                                                                                  miR-147-3p,miR-210-3p
# 7mers
hfk.weird.peaks.k7 <- findEnrichedKmersPeaks(hfk.weird.peaks, k7.background.hfk, k=7)
ggplot(hfk.weird.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in weird HFK-specific peaks")

pdf("PDF_figure/Kmer_weird_HFK_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(hfk.weird.peaks.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 7mers in weird HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.weird.peaks.k7[1:10,]
##        kmer   enrich
##  1: CTACCTC 2.836011
##  2: CTGTTAC 2.722882
##  3: AGTATTA 2.638295
##  4: TGTTACA 2.533839
##  5: ACCTACC 2.462115
##  6: CCTACCT 2.451164
##  7: TACCTCA 2.322902
##  8: GGTGCTA 2.317842
##  9: GCAGTAT 2.266959
## 10: TCTACCT 2.226000
##                                                                                                           miR
##  1: let-7a-5p,let-7b-5p,let-7c-5p,let-7d-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  2:                                                                                                miR-194-5p
##  3:                                                                        miR-200b-3p,miR-200c-3p,miR-429-3p
##  4:                                                                                                miR-194-5p
##  5:                                                                                                          
##  6:                                                                                                          
##  7:    let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7j,let-7k,miR-1961,miR-98-5p
##  8:                                                                          miR-29a-3p,miR-29b-3p,miR-29c-3p
##  9:                                                                                                          
## 10:                                                                                               miR-1839-5p
# 8mers
hfk.weird.peaks.k8 <- findEnrichedKmersPeaks(hfk.weird.peaks, k8.background.hfk, k=8)
ggplot(hfk.weird.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in weird HFK-specific peaks")

pdf("PDF_figure/Kmer_weird_HFK_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(hfk.weird.peaks.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(peaks / background)") +
  labs(title = "Enriched 8mers in weird HFK-specific peaks")
dev.off()
## quartz_off_screen 
##                 2
hfk.weird.peaks.k8[1:10,]
##         kmer   enrich
##  1: CTGTTACA 3.079737
##  2: TCTACCTC 3.043032
##  3: CAGTATTA 2.858762
##  4: CCTACCTC 2.825131
##  5: ACCTACCT 2.803865
##  6: CTACCTCA 2.784516
##  7: ACTGTTAC 2.748970
##  8: TGGTGCTA 2.696374
##  9: CTACCTCT 2.687698
## 10: GCAGTATT 2.653731
##                                                                                                 miR
##  1:                                                                                      miR-194-5p
##  2:                                                                                                
##  3:                                                              miR-200b-3p,miR-200c-3p,miR-429-3p
##  4:                                                                                                
##  5:                                                                                                
##  6: let-7a-5p,let-7b-5p,let-7c-5p,let-7e-5p,let-7f-5p,let-7g-5p,let-7i-5p,let-7k,miR-1961,miR-98-5p
##  7:                                                                                                
##  8:                                                                miR-29a-3p,miR-29b-3p,miR-29c-3p
##  9:                                                                                       let-7d-5p
## 10:

HOMER run

hfk.weird.peaks.seq <- get.seqs(bsgenome, hfk.weird.peaks)

## HOMER run of K7 kmer
prepareHOMERinput(kmer.results = hfk.weird.peaks.k7, 
                  peaks = hfk.weird.peaks, 
                  peaks.seq = hfk.weird.peaks.seq,
                  filename.tag = "hfk.weird",
                  n.kmers = 50,
                  width = 15)
## [1] "info on k-mer positions (count and width distribution):"
## [1] 356
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   16.98   17.00   34.00 
## [1] "info on background positions (count and width distribution):"
## [1] 1323
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   15.00   16.00   17.06   17.00   44.00
genomic.regions <- "Kmer/hfk.weird-peaks-k7-windows.bed"
background.regions <- "Kmer/hfk.weird-peaks-k7-windows-background.bed"
homerdir <- "Kmer/homer-denovo-output-hfk.weird-peaks-k7"
dir.create(homerdir, showWarnings = FALSE)
homer.cmd <- sprintf("findMotifsGenome.pl %s mm10 %s -bg %s -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output",
                     genomic.regions, homerdir, background.regions)
## Run in terminal
## system(homer.cmd)
print(homer.cmd)
## [1] "findMotifsGenome.pl Kmer/hfk.weird-peaks-k7-windows.bed mm10 Kmer/homer-denovo-output-hfk.weird-peaks-k7 -bg Kmer/hfk.weird-peaks-k7-windows-background.bed -len 8 -size given -rna -noweight -minlp -5 -nlen 2 -N 200000 -bits -p 10 -cache 1000 >.homer-output 2>.err.homer-output"
hfk.weird.peaks.homer.res <- loadHomerResults(homerdir)
human.mirnas <-
    sapply(strsplit(hfk.weird.peaks.homer.res[[1]]$best.match, " "), "[", 1)
print(human.mirnas)
##  [1] "hsa-miR-4500" "hsa-miR-194"  "hsa-miR-200b" "hsa-miR-29c"  "hsa-miR-24"  
##  [6] "hsa-miR-210"  "hsa-miR-4736" "hsa-miR-498"  "hsa-miR-5047" "hsa-miR-4469"
## [11] "hsa-miR-3681" "hsa-miR-3131"
mouse.mirnas <- c("let-7/miR-98", "miR-194", "miR-200/429", "miR-29", "miR-24", "miR-210", "miR-4736", "miR-498", "hsa-miR-5047", "hsa-miR-3682", "hsa-miR-3681", "hsa-miR-3131")
hfk.weird.peaks.homer.res[[1]]$best.match.simple <- mouse.mirnas
plotHomerResults(hfk.weird.peaks.homer.res[[1]],
                 hfk.weird.peaks.homer.res[[2]], n.motifs = 8)

pdf("PDF_figure/Homer_weird_HFK_peaks.pdf",
    width = 6,
    height = 10)
plotHomerResults(hfk.weird.peaks.homer.res[[1]],
                 hfk.weird.peaks.homer.res[[2]], n.motifs = 8)
dev.off()
## quartz_off_screen 
##                 2
Background kmer enrichment in transcripts with HFK peaks

As a sanity check, we need to compare the above enrichment with enrichment of k-mers in whole transcript sequences (because the composition of peak position is ~50% 3'UTR, ~30% exon and ~20% intron).

mm10.withpeaks.hfk <- subsetByOverlaps(mm10.transcript, peaks.all.hfk)
mm10.withpeaks.hfk.seq <- get.seqs(bsgenome, mm10.withpeaks.hfk)

# 6mer
transcript.hfk.k6 <- findEnrichedKmersSeqs(mm10.withpeaks.hfk.seq, k6.background.hfk)
ggplot(transcript.hfk.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 6mers in all transcripts with HFK peaks")

pdf("PDF_figure/Kmer_background_HFK_peaks_k6.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hfk.k6[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 6mers in all transcripts with HFK peaks")
dev.off()
## quartz_off_screen 
##                 2
# 7mer
transcript.hfk.k7 <- findEnrichedKmersSeqs(mm10.withpeaks.hfk.seq, k7.background.hfk, k=7)
ggplot(transcript.hfk.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 7mers in all transcripts with HFK peaks")

pdf("PDF_figure/Kmer_background_HFK_peaks_k7.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hfk.k7[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 7mers in all transcripts with HFK peaks")
dev.off()
## quartz_off_screen 
##                 2
# 8mer
transcript.hfk.k8 <- findEnrichedKmersSeqs(mm10.withpeaks.hfk.seq, k8.background.hfk, k=8)
ggplot(transcript.hfk.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 8mers in all transcripts with HFK peaks")

pdf("PDF_figure/Kmer_background_HFK_peaks_k8.pdf",
    width = 5,
    height = 4)
ggplot(transcript.hfk.k8[1:10, ],
       aes(x = reorder(kmer, enrich), y = enrich)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Frequency\nlog2(transcript / background)") +
  labs(title = "Background enriched 8mers in all transcripts with HFK peaks")
dev.off()
## quartz_off_screen 
##                 2

Associate kmer to peaks

Associate all HF and HFK specific HEAP peaks to miRNAs

# HF
hf.specific.peak.seq <- get.seqs(bsgenome, hf.specific.peaks)

hf.peak.info <- as.data.frame(hf.specific.peaks)
hf.peak.info$targeting_mirna_6mer<- NA
hf.peak.info$targeting_mirna_7mer<- NA
hf.peak.info$targeting_mirna_8mer<- NA

# 6mer
for (i in 1:dim(hf.specific.peaks.k6)[1]) {
  six_mer <- DNAString(hf.specific.peaks.k6$kmer[i])
  mindex <- vmatchPattern(six_mer, hf.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hf.peak.info$targeting_mirna_6mer[n])) {
        hf.peak.info$targeting_mirna_6mer[n] <- hf.specific.peaks.k6$miR[i]
      }
      else {
        hf.peak.info$targeting_mirna_6mer[n] <- paste(hf.peak.info$targeting_mirna_6mer[n], hf.specific.peaks.k6$miR[i], sep="")
      }
    }
  }
}

# 7mer
for (i in 1:dim(hf.specific.peaks.k7)[1]) {
  seven_mer <- DNAString(hf.specific.peaks.k7$kmer[i])
  mindex <- vmatchPattern(seven_mer, hf.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hf.peak.info$targeting_mirna_7mer[n])) {
        hf.peak.info$targeting_mirna_7mer[n] <- hf.specific.peaks.k7$miR[i]
      }
      else {
        hf.peak.info$targeting_mirna_7mer[n] <- paste(hf.peak.info$targeting_mirna_7mer[n], hf.specific.peaks.k7$miR[i], sep="")
      }
    }
  }
}

# 8mer
for (i in 1:dim(hf.specific.peaks.k8)[1]) {
  eight_mer <- DNAString(hf.specific.peaks.k8$kmer[i])
  mindex <- vmatchPattern(eight_mer, hf.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hf.peak.info$targeting_mirna_8mer[n])) {
        hf.peak.info$targeting_mirna_8mer[n] <- hf.specific.peaks.k8$miR[i]
      }
      else {
        hf.peak.info$targeting_mirna_8mer[n] <- paste(hf.peak.info$targeting_mirna_8mer[n], hf.specific.peaks.k8$miR[i], sep="")
      }
    }
  }
}

write.csv(hf.peak.info, "HF_uniq_peak_mirna_match.csv")

# HFK
hfk.specific.peak.seq <- get.seqs(bsgenome, hfk.specific.peaks)

hfk.peak.info <- as.data.frame(hfk.specific.peaks)
hfk.peak.info$targeting_mirna_6mer<- NA
hfk.peak.info$targeting_mirna_7mer<- NA
hfk.peak.info$targeting_mirna_8mer<- NA

# 6mer
for (i in 1:dim(hfk.specific.peaks.k6)[1]) {
  six_mer <- DNAString(hfk.specific.peaks.k6$kmer[i])
  mindex <- vmatchPattern(six_mer, hfk.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hfk.peak.info$targeting_mirna_6mer[n])) {
        hfk.peak.info$targeting_mirna_6mer[n] <- hfk.specific.peaks.k6$miR[i]
      }
      else {
        hfk.peak.info$targeting_mirna_6mer[n] <- paste(hfk.peak.info$targeting_mirna_6mer[n], hfk.specific.peaks.k6$miR[i], sep="")
      }
    }
  }
}

# 7mer
for (i in 1:dim(hfk.specific.peaks.k7)[1]) {
  seven_mer <- DNAString(hfk.specific.peaks.k7$kmer[i])
  mindex <- vmatchPattern(seven_mer, hfk.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hfk.peak.info$targeting_mirna_7mer[n])) {
        hfk.peak.info$targeting_mirna_7mer[n] <- hfk.specific.peaks.k7$miR[i]
      }
      else {
        hfk.peak.info$targeting_mirna_7mer[n] <- paste(hfk.peak.info$targeting_mirna_7mer[n], hfk.specific.peaks.k7$miR[i], sep="")
      }
    }
  }
}

# 8mer
for (i in 1:dim(hfk.specific.peaks.k8)[1]) {
  eight_mer <- DNAString(hfk.specific.peaks.k8$kmer[i])
  mindex <- vmatchPattern(eight_mer, hfk.specific.peak.seq)
  nmatch_per_seq <- elementNROWS(mindex)
  for (n in 1:length(nmatch_per_seq)) {
    if (nmatch_per_seq[n] > 0) {
      if (is.na(hfk.peak.info$targeting_mirna_8mer[n])) {
        hfk.peak.info$targeting_mirna_8mer[n] <- hfk.specific.peaks.k8$miR[i]
      }
      else {
        hfk.peak.info$targeting_mirna_8mer[n] <- paste(hfk.peak.info$targeting_mirna_8mer[n], hfk.specific.peaks.k8$miR[i], sep="")
      }
    }
  }
}

write.csv(hfk.peak.info, "HFK_uniq_peak_mirna_match.csv")

Match hit HEAP targets with corresponding miRNAs

# HF
hit_hf_mirna_index <- c()
for (i in 1:dim(hit_hf_peak_list)[1]) {
  hit_hf_mirna_index <- c(hit_hf_mirna_index, grep(paste("^", hit_hf_peak_list$name[i], "$", sep = ""), hf.peak.info$name))
}
hit_hf_target_miRNA <- hf.peak.info[hit_hf_mirna_index,]
write.csv(hit_hf_target_miRNA, "hit_HF_mirna_match.csv")

# HFK
hit_hfk_mirna_index <- c()
for (i in 1:dim(hit_hfk_peak_list)[1]) {
  hit_hfk_mirna_index <- c(hit_hfk_mirna_index, grep(paste("^", hit_hfk_peak_list$name[i], "$", sep = ""), hfk.peak.info$name))
}
hit_hfk_target_miRNA <- hfk.peak.info[hit_hfk_mirna_index,]
write.csv(hit_hfk_target_miRNA, "hit_HFK_mirna_match.csv")

Match weird HFK targets with corresponding miRNAs

werid_hfk_mirna_index <- c()
for (i in 1:dim(weird_hfk_peak_list)[1]) {
  werid_hfk_mirna_index <- c(werid_hfk_mirna_index, grep(paste("^", weird_hfk_peak_list$name[i], "$", sep = ""), hfk.peak.info$name))
}
weird_hfk_target_miRNA <- hfk.peak.info[werid_hfk_mirna_index,]
write.csv(weird_hfk_target_miRNA, "weird_HFK_mirna_match.csv")

Cross-analysis of kmers enriched in HF and HFK samples

I want to find out what are the kmers that are only enriched in HF samples or only enriched in HFK samples.

I am only using the 7mer analysis result for HF and HFK specific peaks here. I will expand later.

# Specific peaks
hf_specific_7mer <- as.data.frame(setdiff(hf.specific.peaks.k7$kmer, hfk.specific.peaks.k7$kmer))
colnames(hf_specific_7mer)[1] <- "HF_specific_7mer"
hfk_specific_7mer <- as.data.frame(setdiff(hfk.specific.peaks.k7$kmer, hf.specific.peaks.k7$kmer))
colnames(hfk_specific_7mer)[1] <- "HFK_specific_7mer"

hf_specific_7mer <- inner_join(hf_specific_7mer, hf.specific.peaks.k7, by = c("HF_specific_7mer" = "kmer"))

hfk_specific_7mer <- inner_join(hfk_specific_7mer, hfk.specific.peaks.k7, by = c("HFK_specific_7mer" = "kmer"))

# output the results into csv files
write_csv(hf_specific_7mer, "HF_specific_7mer.csv")
write_csv(hfk_specific_7mer, "HFK_specific_7mer.csv")

# hit peaks
hf_hit_7mer <- as.data.frame(setdiff(hf.hit.peaks.k7$kmer, hfk.hit.peaks.k7$kmer))
colnames(hf_hit_7mer)[1] <- "HF_hit_7mer"
hfk_hit_7mer <- as.data.frame(setdiff(hfk.hit.peaks.k7$kmer, hf.hit.peaks.k7$kmer))
colnames(hfk_hit_7mer)[1] <- "HFK_hit_7mer"

hf_hit_7mer <- inner_join(hf_hit_7mer, hf.hit.peaks.k7, by = c("HF_hit_7mer" = "kmer"))

hfk_hit_7mer <- inner_join(hfk_hit_7mer, hfk.hit.peaks.k7, by = c("HFK_hit_7mer" = "kmer"))

# output the results into csv files
write_csv(hf_hit_7mer, "HF_hit_7mer.csv")
write_csv(hfk_hit_7mer, "HFK_hit_7mer.csv")

Overlap this with the list of miRNAs that are up/down-regulated by KRas in the literature that I compiled.

Cross-analysis of kmers enriched in hit HFK and weird HFK samples

I want to see if there are kmers that are unique to the weird HFK peaks.

# Specific peaks
weird_hfk_specific_7mer <- as.data.frame(setdiff(hfk.weird.peaks.k7$kmer, hfk.hit.peaks.k7$kmer))
colnames(weird_hfk_specific_7mer)[1] <- "weird_HFK_specific_7mer"

weird_hfk_specific_7mer <- inner_join(weird_hfk_specific_7mer, hfk.weird.peaks.k7, by = c("weird_HFK_specific_7mer" = "kmer"))

# output the results into csv files
write_csv(weird_hfk_specific_7mer, "Weird_HFK_specific_7mer.csv")

Overlap all peaks with TargetScan predicted targets

For all peaks detected in the HF and HFK HEAP-CLIP analysis with a padj < 0.05.

# get a GRanges object that have all peaks with padj < 0.05
sig.peaks <- subset(all.peaks, padj < padj.threshold)

# load TargetScan predictions
targetscan.all <- import("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization/Predicted_Targets/Targets_CS_pctiles.mm10.sorted.broadConsFam.consSite.bed")
targetscan.all$miR <- sapply(strsplit(targetscan.all$name, ":"), "[", 2)
targetscan.all$gene <-sapply(strsplit(targetscan.all$name, ":"), "[", 1)
targetscan <- subset(targetscan.all, score > 50)
targetscan <- subset(targetscan, width > 1 & width < 12)
sig.peaks.targetscan <- subset(targetscan, overlapsAny(targetscan, sig.peaks))
sig.peaks.targetscan.count <- as.data.frame(sort(table(sig.peaks.targetscan$miR),
                                                  decreasing = TRUE))
ggplot(sig.peaks.targetscan.count, aes(x = Var1, y = Freq)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Number of TargetScan predictions\noverlapping with significant HEAP-CLIP peaks") +
  scale_x_discrete(limits = rev(levels(sig.peaks.targetscan.count$Var1)))

pdf("PDF_figure/Overlap_with_TargetScan.pdf",
    width = 6,
    height = 5)
ggplot(sig.peaks.targetscan.count, aes(x = Var1, y = Freq)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Number of TargetScan predictions\noverlapping with significant HEAP-CLIP peaks") +
  scale_x_discrete(limits = rev(levels(sig.peaks.targetscan.count$Var1)))
dev.off()
## quartz_off_screen 
##                 2
# what percentage of all peaks are predicted by TargetScan
paste(sum(sig.peaks.targetscan.count$Freq)/length(sig.peaks)*100, "%", sep = "")
## [1] "8.84988114829036%"

For hit HEAP peaks detected in the HEAP-CLIP analysis with a padj < 0.05 and with decreased protein expression in the proteomics

all.hit.peaks <- c(hf.hit.peaks, hfk.hit.peaks)
all.hit.peaks.targetscan <- subset(targetscan, overlapsAny(targetscan, all.hit.peaks))
all.hit.eaks.targetscan.count <- as.data.frame(sort(table(all.hit.peaks.targetscan$miR),
                                                  decreasing = TRUE))
ggplot(all.hit.eaks.targetscan.count, aes(x = Var1, y = Freq)) +
  geom_col() + coord_flip() + theme_bw() + xlab("") +
  ylab("Number of TargetScan predictions\noverlapping with all hit HEAP peaks") +
  scale_x_discrete(limits = rev(levels(all.hit.eaks.targetscan.count$Var1)))

# what percentage of all hit peaks are predicted by TargetScan
paste(sum(all.hit.eaks.targetscan.count$Freq)/length(all.hit.peaks)*100, "%", sep = "")
## [1] "8.29268292682927%"

GO Analysis

Load libraries necessary for the analysis

library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
library(readr)
library(dplyr)
library(clusterProfiler)
library(DOSE)
library(pathview)

Generate reference files as input for GO analysis

In order to perform GO analysis, I need to generate a universe file that contain all peaks detected in HF and HFK with their targets Ensembl ID.

all.peaks.go <- as.data.frame(all.peaks, row.names = NULL)

# annotate the list with potential target gene and Ensembl ID
all.peaks.go$'target_gene' <- NA
for (i in 1:dim(all.peaks.go)) {
  if (!is.na(all.peaks.go[i,12]) | !is.na(all.peaks.go[i,13])) {
    gene_name <- unique(unlist(all.peaks.go[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    all.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(all.peaks.go[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    all.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(all.peaks.go): numerical expression has 2 elements: only the
## first used
all_gene_list <- as.data.frame(table(all.peaks.go$target_gene))

# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
annotations_all <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(all_gene_list$Var1),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_all$GENENAME) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_all$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_all <- annotations_all[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_all$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID and Uniprot ID from BioMart
all.peaks.go <- inner_join(all.peaks.go, annotations_all, by=c("target_gene"="GENENAME"))

# how many genes have Ensembl ID
sum(!is.na(annotations_all$GENEID))
## [1] 2515

Overlap targets

detected_gene <- as.character(unique(all.peaks.go$GENEID))

# annotate the list with potential target gene and Ensembl ID
overlap.peaks$'target_gene' <- NA
for (i in 1:dim(overlap.peaks)) {
  if (!is.na(overlap.peaks[i,12]) | !is.na(overlap.peaks[i,13])) {
    gene_name <- unique(unlist(overlap.peaks[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    overlap.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(overlap.peaks[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    overlap.peaks$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(overlap.peaks): numerical expression has 2 elements: only the
## first used
overlap_gene_list <- as.data.frame(table(overlap.peaks$target_gene))
# annotate the overlap peak list with Ensembl ID
annotations_overlap <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(overlap_gene_list$Var1),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_overlap$GENENAME) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_overlap$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_overlap <- annotations_overlap[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_overlap$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID and Uniprot ID from BioMart
overlap.peaks <- inner_join(overlap.peaks, annotations_overlap, by=c("target_gene"="GENENAME"))

# GO analysis
target_gene <- as.character(unique(overlap.peaks$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/Overlap_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/Overlap_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/Overlap_GO analysis_CC.csv")

No enrichment was found.

HF

all HF

hf.peaks.go <- as.data.frame(hf.peaks)
# annotate the list with potential target gene and Ensembl ID
hf.peaks.go$'target_gene' <- NA
for (i in 1:dim(hf.peaks.go)) {
  if (!is.na(hf.peaks.go[i,12]) | !is.na(hf.peaks.go[i,13])) {
    gene_name <- unique(unlist(hf.peaks.go[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    hf.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(hf.peaks.go[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    hf.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(hf.peaks.go): numerical expression has 2 elements: only the
## first used
hf_gene_list <- as.data.frame(table(hf.peaks.go$target_gene))
# annotate the overlap peak list with Ensembl ID
annotations_hf_all <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(hf_gene_list$Var1),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_hf_all$GENENAME) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_hf_all$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_hf_all <- annotations_hf_all[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_hf_all$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID and Uniprot ID from BioMart
hf.peaks.go <- inner_join(hf.peaks.go, annotations_hf_all, by=c("target_gene"="GENENAME"))

# GO analysis
target_gene <- as.character(unique(hf.peaks.go$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HF_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HF_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HF_GO analysis_CC.csv")
Cellular Component
png('GO Analysis/HF_GO dotplot_CC.png',
    width = 800,
    height = 800,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_all_HF_peaks_CC_dotplot.pdf",
    width = 8,
    height = 4)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output for Cellular Component is CC_dotplot

png('GO Analysis/HF_GO enrichment_CC.png',
    width = 800,
    height = 800,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_all_HF_peaks_CC_emap.pdf",
    width = 8,
    height = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is MF_enrichplot

HF-specific

# GO analysis
target_gene <- as.character(unique(hf.uniq.peaks$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HF_specific_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HF_specific_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HF_specific_GO analysis_CC.csv")
Cellular Component
png('GO Analysis/HF_specific_GO dotplot_CC.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_HF_specific_peaks_CC_dotplot.pdf",
    width = 8,
    height = 8)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output for Cellular Component is CC_dotplot

png('GO Analysis/HF_specific_GO enrichment_CC.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_HF_specific_peaks_CC_emap.pdf",
    width = 8,
    height = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is MF_enrichplot

hit HF

# GO analysis
target_gene <- as.character(unique(hit_hf_peak_list$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HF_hit_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HF_hit_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HF_hit_GO analysis_CC.csv")
Biological Process
png('GO Analysis/HF_hit_GO enrichment_BP.png',
    width = 800,
    height = 300,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HF_peaks_BP_dotplot.pdf",
    width = 8,
    height = 3)
dotplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is BP_enrichplot

Cellular Component
png('GO Analysis/HF_hit_GO dotplot_CC.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HF_peaks_CC_dotplot.pdf",
    width = 8,
    height = 10)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output for Cellular Component is CC_dotplot

png('GO Analysis/HF_hit_GO enrichment_CC.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HF_peaks_CC_emap.pdf",
    width = 8,
    height = 8)
emapplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output is CC_enrichplot

Molecular Function
png('GO Analysis/HF_hit_GO dotplot_MF.png',
    width = 800,
    height = 400,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HF_peaks_MF_dotplot.pdf",
    width = 8,
    height = 4)
dotplot(ego_MF, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output for Cellular Component is MF_dotplot

png('GO Analysis/HF_hit_GO enrichment_MF.png',
    width = 1200,
    height = 1200,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HF_peaks_MF_emap.pdf",
    width = 8,
    height = 8)
emapplot(ego_MF, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output is MF_enrichplot

HFK

all HFK

hfk.peaks.go <- as.data.frame(hfk.peaks)
# annotate the list with potential target gene and Ensembl ID
hf.peaks.go$'target_gene' <- NA
for (i in 1:dim(hfk.peaks.go)) {
  if (!is.na(hfk.peaks.go[i,12]) | !is.na(hfk.peaks.go[i,13])) {
    gene_name <- unique(unlist(hfk.peaks.go[i,12:13]))
    gene_name <- gene_name[!is.na(gene_name)] 
    hfk.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(unlist(hfk.peaks.go[i,8:11]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    hfk.peaks.go$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}
## Warning in 1:dim(hfk.peaks.go): numerical expression has 2 elements: only the
## first used
hfk_gene_list <- as.data.frame(table(hfk.peaks.go$target_gene))
# annotate the overlap peak list with Ensembl ID
annotations_hfk_all <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(hfk_gene_list$Var1),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_hfk_all$GENENAME) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_hfk_all$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_hfk_all <- annotations_hfk_all[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_hfk_all$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID and Uniprot ID from BioMart
hfk.peaks.go <- inner_join(hfk.peaks.go, annotations_hfk_all, by=c("target_gene"="GENENAME"))

# GO analysis
target_gene <- as.character(unique(hfk.peaks.go$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HFK_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HFK_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HFK_GO analysis_CC.csv")

HFK-specific

# GO analysis
target_gene <- as.character(unique(hfk.uniq.peaks$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HFK_specific_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HFK_specific_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HFK_specific_GO analysis_CC.csv")

hit HFK

# GO analysis
target_gene <- as.character(unique(hit_hfk_peak_list$GENEID))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO Analysis/HFK_hit_GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO Analysis/HFK_hit_GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO Analysis/HFK_hit_GO analysis_CC.csv")
Cellular Component
png('GO Analysis/HFK_hit_GO dotplot_CC.png',
    width = 1200,
    height = 600,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/GO_hit_HFK_peaks_CC_dotplot.pdf",
    width = 8,
    height = 3)
dotplot(ego_CC, showCategory=41)
dev.off()
## quartz_off_screen 
##                 2

Final output is CC_dotplot

SessionInfo

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] pathview_1.28.1             DOSE_3.14.0                
##  [3] clusterProfiler_3.16.1      mosaic_1.8.3               
##  [5] ggridges_0.5.3              mosaicData_0.20.2          
##  [7] ggformula_0.10.1            ggstance_0.3.5             
##  [9] Matrix_1.3-4                lattice_0.20-44            
## [11] gridExtra_2.3               ggseqlogo_0.1              
## [13] rvest_1.0.0                 ggrepel_0.9.1              
## [15] pheatmap_1.0.12             gplots_3.1.1               
## [17] RColorBrewer_1.1-2          CLIPanalyze_0.0.10         
## [19] GenomicAlignments_1.24.0    plyr_1.8.6                 
## [21] VennDiagram_1.6.20          futile.logger_1.4.3        
## [23] Rsubread_2.2.6              DESeq2_1.28.1              
## [25] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
## [27] matrixStats_0.59.0          Rsamtools_2.4.0            
## [29] Biostrings_2.56.0           XVector_0.28.0             
## [31] rtracklayer_1.48.0          forcats_0.5.1              
## [33] stringr_1.4.0               dplyr_1.0.6                
## [35] purrr_0.3.4                 readr_1.4.0                
## [37] tidyr_1.1.3                 tibble_3.1.2               
## [39] ggplot2_3.3.3               tidyverse_1.3.1            
## [41] data.table_1.14.0           org.Mm.eg.db_3.11.4        
## [43] EnsDb.Mmusculus.v79_2.99.0  ensembldb_2.12.1           
## [45] AnnotationFilter_1.12.0     GenomicFeatures_1.40.1     
## [47] AnnotationDbi_1.50.3        Biobase_2.48.0             
## [49] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [51] IRanges_2.22.2              S4Vectors_0.26.1           
## [53] BiocGenerics_0.34.0        
## 
## loaded via a namespace (and not attached):
##   [1] BSgenome.Mmusculus.UCSC.mm10_1.4.0 utf8_1.2.1                        
##   [3] tidyselect_1.1.1                   RSQLite_2.2.7                     
##   [5] htmlwidgets_1.5.3                  BiocParallel_1.22.0               
##   [7] scatterpie_0.1.6                   munsell_0.5.0                     
##   [9] withr_2.4.2                        colorspace_2.0-1                  
##  [11] GOSemSim_2.14.2                    biosignals_0.1.0                  
##  [13] highr_0.9                          knitr_1.33                        
##  [15] rstudioapi_0.13                    labeling_0.4.2                    
##  [17] KEGGgraph_1.48.0                   urltools_1.7.3                    
##  [19] GenomeInfoDbData_1.2.3             polyclip_1.10-0                   
##  [21] bit64_4.0.5                        farver_2.1.0                      
##  [23] downloader_0.4                     vctrs_0.3.8                       
##  [25] generics_0.1.0                     lambda.r_1.2.4                    
##  [27] xfun_0.23                          BiocFileCache_1.12.1              
##  [29] R6_2.5.0                           graphlayouts_0.7.1                
##  [31] locfit_1.5-9.4                     gridGraphics_0.5-1                
##  [33] bitops_1.0-7                       cachem_1.0.5                      
##  [35] fgsea_1.14.0                       assertthat_0.2.1                  
##  [37] scales_1.1.1                       ggraph_2.0.5                      
##  [39] enrichplot_1.8.1                   gtable_0.3.0                      
##  [41] tidygraph_1.2.0                    rlang_0.4.11                      
##  [43] genefilter_1.70.0                  splines_4.0.3                     
##  [45] lazyeval_0.2.2                     selectr_0.4-2                     
##  [47] europepmc_0.4                      broom_0.7.7                       
##  [49] mosaicCore_0.9.0                   BiocManager_1.30.15               
##  [51] yaml_2.2.1                         reshape2_1.4.4                    
##  [53] modelr_0.1.8                       crosstalk_1.1.1                   
##  [55] backports_1.2.1                    qvalue_2.20.0                     
##  [57] tools_4.0.3                        ggplotify_0.0.7                   
##  [59] ellipsis_0.3.2                     jquerylib_0.1.4                   
##  [61] ggdendro_0.1.22                    Rcpp_1.0.6                        
##  [63] progress_1.2.2                     zlibbioc_1.34.0                   
##  [65] RCurl_1.98-1.3                     prettyunits_1.1.1                 
##  [67] openssl_1.4.4                      viridis_0.6.1                     
##  [69] cowplot_1.1.1                      haven_2.4.1                       
##  [71] fs_1.5.0                           magrittr_2.0.1                    
##  [73] futile.options_1.0.1               DO.db_2.9                         
##  [75] triebeard_0.3.0                    reprex_2.0.0                      
##  [77] ProtGenerics_1.20.0                hms_1.1.0                         
##  [79] evaluate_0.14                      xtable_1.8-4                      
##  [81] XML_3.99-0.6                       leaflet_2.0.4.1                   
##  [83] readxl_1.3.1                       compiler_4.0.3                    
##  [85] biomaRt_2.44.4                     KernSmooth_2.23-20                
##  [87] crayon_1.4.1                       htmltools_0.5.1.1                 
##  [89] geneplotter_1.66.0                 lubridate_1.7.10                  
##  [91] DBI_1.1.1                          tweenr_1.0.2                      
##  [93] formatR_1.11                       dbplyr_2.1.1                      
##  [95] MASS_7.3-54                        rappdirs_0.3.3                    
##  [97] cli_2.5.0                          igraph_1.2.6                      
##  [99] pkgconfig_2.0.3                    rvcheck_0.1.8                     
## [101] xml2_1.3.2                         annotate_1.66.0                   
## [103] bslib_0.2.5.1                      digest_0.6.27                     
## [105] graph_1.66.0                       rmarkdown_2.9                     
## [107] cellranger_1.1.0                   fastmatch_1.1-0                   
## [109] curl_4.3.1                         gtools_3.9.2                      
## [111] lifecycle_1.0.0                    jsonlite_1.7.2                    
## [113] viridisLite_0.4.0                  askpass_1.1                       
## [115] BSgenome_1.56.0                    fansi_0.5.0                       
## [117] labelled_2.8.0                     pillar_1.6.1                      
## [119] KEGGREST_1.28.0                    fastmap_1.1.0                     
## [121] httr_1.4.2                         survival_3.2-11                   
## [123] GO.db_3.11.4                       glue_1.4.2                        
## [125] png_0.1-7                          Rgraphviz_2.32.0                  
## [127] bit_4.0.4                          ggforce_0.3.3                     
## [129] stringi_1.6.2                      sass_0.4.0                        
## [131] blob_1.2.1                         org.Hs.eg.db_3.11.4               
## [133] caTools_1.18.2                     memoise_2.0.0